rocksdb-native 2.2.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261) hide show
  1. package/binding.c +92 -10
  2. package/index.js +9 -0
  3. package/lib/batch.js +11 -1
  4. package/lib/iterator.js +3 -1
  5. package/lib/snapshot.js +21 -0
  6. package/package.json +1 -1
  7. package/prebuilds/darwin-arm64/rocksdb-native.bare +0 -0
  8. package/prebuilds/darwin-arm64/rocksdb-native.node +0 -0
  9. package/prebuilds/darwin-x64/rocksdb-native.bare +0 -0
  10. package/prebuilds/darwin-x64/rocksdb-native.node +0 -0
  11. package/prebuilds/linux-arm64/rocksdb-native.bare +0 -0
  12. package/prebuilds/linux-arm64/rocksdb-native.node +0 -0
  13. package/prebuilds/linux-x64/rocksdb-native.bare +0 -0
  14. package/prebuilds/linux-x64/rocksdb-native.node +0 -0
  15. package/prebuilds/win32-x64/rocksdb-native.bare +0 -0
  16. package/prebuilds/win32-x64/rocksdb-native.node +0 -0
  17. package/vendor/librocksdb/include/rocksdb.h +38 -4
  18. package/vendor/librocksdb/src/rocksdb.cc +114 -14
  19. package/vendor/librocksdb/vendor/rocksdb/CMakeLists.txt +21 -4
  20. package/vendor/librocksdb/vendor/rocksdb/cache/secondary_cache_adapter.cc +6 -3
  21. package/vendor/librocksdb/vendor/rocksdb/db/arena_wrapped_db_iter.cc +4 -4
  22. package/vendor/librocksdb/vendor/rocksdb/db/arena_wrapped_db_iter.h +4 -2
  23. package/vendor/librocksdb/vendor/rocksdb/db/attribute_group_iterator_impl.cc +20 -0
  24. package/vendor/librocksdb/vendor/rocksdb/db/attribute_group_iterator_impl.h +83 -0
  25. package/vendor/librocksdb/vendor/rocksdb/db/builder.cc +9 -5
  26. package/vendor/librocksdb/vendor/rocksdb/db/builder.h +1 -1
  27. package/vendor/librocksdb/vendor/rocksdb/db/c.cc +231 -6
  28. package/vendor/librocksdb/vendor/rocksdb/db/c_test.c +202 -2
  29. package/vendor/librocksdb/vendor/rocksdb/db/coalescing_iterator.cc +47 -0
  30. package/vendor/librocksdb/vendor/rocksdb/db/coalescing_iterator.h +79 -0
  31. package/vendor/librocksdb/vendor/rocksdb/db/column_family.cc +28 -0
  32. package/vendor/librocksdb/vendor/rocksdb/db/column_family.h +17 -0
  33. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction.cc +8 -1
  34. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction.h +11 -9
  35. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_iterator.cc +50 -23
  36. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_iterator.h +13 -0
  37. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_job.cc +22 -25
  38. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_job.h +2 -0
  39. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_outputs.cc +8 -1
  40. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_outputs.h +1 -0
  41. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_picker.cc +40 -17
  42. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_picker.h +20 -14
  43. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_picker_level.cc +11 -6
  44. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_picker_universal.cc +77 -24
  45. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_service_job.cc +2 -0
  46. package/vendor/librocksdb/vendor/rocksdb/db/convenience.cc +3 -0
  47. package/vendor/librocksdb/vendor/rocksdb/db/db_filesnapshot.cc +125 -31
  48. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl.cc +457 -231
  49. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl.h +172 -73
  50. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl_compaction_flush.cc +152 -133
  51. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl_debug.cc +5 -0
  52. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl_files.cc +58 -52
  53. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl_follower.cc +348 -0
  54. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl_follower.h +54 -0
  55. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl_open.cc +136 -117
  56. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl_secondary.cc +4 -3
  57. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl_secondary.h +7 -6
  58. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl_write.cc +134 -80
  59. package/vendor/librocksdb/vendor/rocksdb/db/db_iter.cc +11 -0
  60. package/vendor/librocksdb/vendor/rocksdb/db/db_test2.cc +1 -1
  61. package/vendor/librocksdb/vendor/rocksdb/db/db_test_util.cc +11 -1
  62. package/vendor/librocksdb/vendor/rocksdb/db/db_test_util.h +11 -7
  63. package/vendor/librocksdb/vendor/rocksdb/db/dbformat.cc +19 -4
  64. package/vendor/librocksdb/vendor/rocksdb/db/dbformat.h +3 -2
  65. package/vendor/librocksdb/vendor/rocksdb/db/error_handler.cc +34 -39
  66. package/vendor/librocksdb/vendor/rocksdb/db/error_handler.h +3 -4
  67. package/vendor/librocksdb/vendor/rocksdb/db/event_helpers.cc +6 -3
  68. package/vendor/librocksdb/vendor/rocksdb/db/experimental.cc +3 -2
  69. package/vendor/librocksdb/vendor/rocksdb/db/external_sst_file_ingestion_job.cc +76 -18
  70. package/vendor/librocksdb/vendor/rocksdb/db/external_sst_file_ingestion_job.h +11 -0
  71. package/vendor/librocksdb/vendor/rocksdb/db/flush_job.cc +37 -5
  72. package/vendor/librocksdb/vendor/rocksdb/db/flush_job.h +14 -0
  73. package/vendor/librocksdb/vendor/rocksdb/db/import_column_family_job.cc +49 -45
  74. package/vendor/librocksdb/vendor/rocksdb/db/internal_stats.cc +60 -1
  75. package/vendor/librocksdb/vendor/rocksdb/db/internal_stats.h +20 -1
  76. package/vendor/librocksdb/vendor/rocksdb/db/log_reader.cc +15 -6
  77. package/vendor/librocksdb/vendor/rocksdb/db/log_writer.cc +59 -10
  78. package/vendor/librocksdb/vendor/rocksdb/db/log_writer.h +8 -0
  79. package/vendor/librocksdb/vendor/rocksdb/db/memtable.cc +24 -40
  80. package/vendor/librocksdb/vendor/rocksdb/db/memtable.h +10 -10
  81. package/vendor/librocksdb/vendor/rocksdb/db/memtable_list.cc +9 -8
  82. package/vendor/librocksdb/vendor/rocksdb/db/multi_cf_iterator_impl.h +296 -0
  83. package/vendor/librocksdb/vendor/rocksdb/db/range_tombstone_fragmenter.h +8 -10
  84. package/vendor/librocksdb/vendor/rocksdb/db/repair.cc +4 -3
  85. package/vendor/librocksdb/vendor/rocksdb/db/seqno_to_time_mapping.cc +30 -0
  86. package/vendor/librocksdb/vendor/rocksdb/db/seqno_to_time_mapping.h +9 -0
  87. package/vendor/librocksdb/vendor/rocksdb/db/table_cache.cc +17 -2
  88. package/vendor/librocksdb/vendor/rocksdb/db/table_cache.h +9 -1
  89. package/vendor/librocksdb/vendor/rocksdb/db/table_properties_collector.h +9 -2
  90. package/vendor/librocksdb/vendor/rocksdb/db/transaction_log_impl.cc +3 -3
  91. package/vendor/librocksdb/vendor/rocksdb/db/transaction_log_impl.h +7 -7
  92. package/vendor/librocksdb/vendor/rocksdb/db/version_edit.cc +0 -1
  93. package/vendor/librocksdb/vendor/rocksdb/db/version_edit_handler.cc +39 -5
  94. package/vendor/librocksdb/vendor/rocksdb/db/version_edit_handler.h +24 -15
  95. package/vendor/librocksdb/vendor/rocksdb/db/version_set.cc +117 -64
  96. package/vendor/librocksdb/vendor/rocksdb/db/version_set.h +27 -10
  97. package/vendor/librocksdb/vendor/rocksdb/db/wal_manager.cc +37 -29
  98. package/vendor/librocksdb/vendor/rocksdb/db/wal_manager.h +6 -5
  99. package/vendor/librocksdb/vendor/rocksdb/db/wide/wide_columns.cc +2 -3
  100. package/vendor/librocksdb/vendor/rocksdb/db/wide/wide_columns_helper.cc +6 -0
  101. package/vendor/librocksdb/vendor/rocksdb/db/write_batch.cc +89 -31
  102. package/vendor/librocksdb/vendor/rocksdb/db/write_thread.cc +53 -5
  103. package/vendor/librocksdb/vendor/rocksdb/db/write_thread.h +36 -4
  104. package/vendor/librocksdb/vendor/rocksdb/env/composite_env_wrapper.h +21 -0
  105. package/vendor/librocksdb/vendor/rocksdb/env/env.cc +15 -0
  106. package/vendor/librocksdb/vendor/rocksdb/env/fs_on_demand.cc +331 -0
  107. package/vendor/librocksdb/vendor/rocksdb/env/fs_on_demand.h +139 -0
  108. package/vendor/librocksdb/vendor/rocksdb/env/io_posix.cc +8 -6
  109. package/vendor/librocksdb/vendor/rocksdb/env/io_posix.h +1 -1
  110. package/vendor/librocksdb/vendor/rocksdb/file/delete_scheduler.cc +130 -27
  111. package/vendor/librocksdb/vendor/rocksdb/file/delete_scheduler.h +61 -8
  112. package/vendor/librocksdb/vendor/rocksdb/file/file_util.cc +25 -4
  113. package/vendor/librocksdb/vendor/rocksdb/file/file_util.h +15 -0
  114. package/vendor/librocksdb/vendor/rocksdb/file/sequence_file_reader.cc +1 -0
  115. package/vendor/librocksdb/vendor/rocksdb/file/sequence_file_reader.h +9 -4
  116. package/vendor/librocksdb/vendor/rocksdb/file/sst_file_manager_impl.cc +18 -0
  117. package/vendor/librocksdb/vendor/rocksdb/file/sst_file_manager_impl.h +31 -4
  118. package/vendor/librocksdb/vendor/rocksdb/file/writable_file_writer.cc +40 -38
  119. package/vendor/librocksdb/vendor/rocksdb/file/writable_file_writer.h +48 -15
  120. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/advanced_options.h +12 -3
  121. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/attribute_groups.h +114 -0
  122. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/c.h +90 -0
  123. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/cache.h +5 -0
  124. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/comparator.h +27 -0
  125. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/db.h +71 -12
  126. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/env.h +9 -0
  127. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/experimental.h +5 -0
  128. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/file_system.h +14 -0
  129. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/iterator.h +9 -71
  130. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/iterator_base.h +90 -0
  131. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/listener.h +21 -0
  132. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/options.h +125 -12
  133. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/perf_context.h +1 -1
  134. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/sst_file_reader.h +11 -1
  135. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/table.h +6 -6
  136. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/table_properties.h +19 -0
  137. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/transaction_log.h +12 -6
  138. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/types.h +12 -0
  139. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/universal_compaction.h +31 -0
  140. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/user_write_callback.h +29 -0
  141. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/cache_dump_load.h +4 -0
  142. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/checkpoint.h +4 -2
  143. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/customizable_util.h +0 -1
  144. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/env_mirror.h +1 -1
  145. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/ldb_cmd.h +24 -7
  146. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/option_change_migration.h +4 -4
  147. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/stackable_db.h +24 -5
  148. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +46 -0
  149. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/transaction.h +42 -17
  150. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/transaction_db.h +5 -0
  151. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/types_util.h +36 -0
  152. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +71 -3
  153. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/version.h +2 -2
  154. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/wide_columns.h +87 -72
  155. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/write_batch_base.h +1 -1
  156. package/vendor/librocksdb/vendor/rocksdb/memory/memory_allocator.cc +1 -0
  157. package/vendor/librocksdb/vendor/rocksdb/options/cf_options.cc +13 -2
  158. package/vendor/librocksdb/vendor/rocksdb/options/cf_options.h +6 -2
  159. package/vendor/librocksdb/vendor/rocksdb/options/db_options.cc +27 -1
  160. package/vendor/librocksdb/vendor/rocksdb/options/db_options.h +10 -3
  161. package/vendor/librocksdb/vendor/rocksdb/options/options.cc +3 -0
  162. package/vendor/librocksdb/vendor/rocksdb/options/options_helper.cc +1 -0
  163. package/vendor/librocksdb/vendor/rocksdb/port/jemalloc_helper.h +2 -2
  164. package/vendor/librocksdb/vendor/rocksdb/port/stack_trace.cc +1 -0
  165. package/vendor/librocksdb/vendor/rocksdb/port/win/port_win.cc +3 -2
  166. package/vendor/librocksdb/vendor/rocksdb/table/block_based/binary_search_index_reader.cc +1 -2
  167. package/vendor/librocksdb/vendor/rocksdb/table/block_based/block_based_table_builder.cc +47 -31
  168. package/vendor/librocksdb/vendor/rocksdb/table/block_based/block_based_table_factory.cc +15 -0
  169. package/vendor/librocksdb/vendor/rocksdb/table/block_based/block_based_table_iterator.cc +37 -18
  170. package/vendor/librocksdb/vendor/rocksdb/table/block_based/block_based_table_iterator.h +10 -3
  171. package/vendor/librocksdb/vendor/rocksdb/table/block_based/block_based_table_reader.cc +102 -41
  172. package/vendor/librocksdb/vendor/rocksdb/table/block_based/block_based_table_reader.h +15 -7
  173. package/vendor/librocksdb/vendor/rocksdb/table/block_based/block_based_table_reader_impl.h +1 -3
  174. package/vendor/librocksdb/vendor/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +5 -6
  175. package/vendor/librocksdb/vendor/rocksdb/table/block_based/block_cache.h +31 -0
  176. package/vendor/librocksdb/vendor/rocksdb/table/block_based/block_prefetcher.cc +6 -0
  177. package/vendor/librocksdb/vendor/rocksdb/table/block_based/cachable_entry.h +10 -5
  178. package/vendor/librocksdb/vendor/rocksdb/table/block_based/filter_block.h +34 -28
  179. package/vendor/librocksdb/vendor/rocksdb/table/block_based/filter_block_reader_common.cc +17 -11
  180. package/vendor/librocksdb/vendor/rocksdb/table/block_based/filter_block_reader_common.h +5 -2
  181. package/vendor/librocksdb/vendor/rocksdb/table/block_based/filter_policy.cc +12 -3
  182. package/vendor/librocksdb/vendor/rocksdb/table/block_based/full_filter_block.cc +37 -30
  183. package/vendor/librocksdb/vendor/rocksdb/table/block_based/full_filter_block.h +11 -13
  184. package/vendor/librocksdb/vendor/rocksdb/table/block_based/hash_index_reader.cc +1 -2
  185. package/vendor/librocksdb/vendor/rocksdb/table/block_based/index_builder.cc +62 -53
  186. package/vendor/librocksdb/vendor/rocksdb/table/block_based/index_builder.h +60 -38
  187. package/vendor/librocksdb/vendor/rocksdb/table/block_based/index_reader_common.cc +14 -9
  188. package/vendor/librocksdb/vendor/rocksdb/table/block_based/index_reader_common.h +4 -1
  189. package/vendor/librocksdb/vendor/rocksdb/table/block_based/partitioned_filter_block.cc +135 -94
  190. package/vendor/librocksdb/vendor/rocksdb/table/block_based/partitioned_filter_block.h +52 -46
  191. package/vendor/librocksdb/vendor/rocksdb/table/block_based/partitioned_index_reader.cc +51 -13
  192. package/vendor/librocksdb/vendor/rocksdb/table/block_based/partitioned_index_reader.h +2 -0
  193. package/vendor/librocksdb/vendor/rocksdb/table/block_based/uncompression_dict_reader.cc +3 -11
  194. package/vendor/librocksdb/vendor/rocksdb/table/block_based/uncompression_dict_reader.h +2 -3
  195. package/vendor/librocksdb/vendor/rocksdb/table/block_fetcher.cc +8 -10
  196. package/vendor/librocksdb/vendor/rocksdb/table/block_fetcher.h +2 -1
  197. package/vendor/librocksdb/vendor/rocksdb/table/compaction_merging_iterator.cc +9 -10
  198. package/vendor/librocksdb/vendor/rocksdb/table/compaction_merging_iterator.h +3 -2
  199. package/vendor/librocksdb/vendor/rocksdb/table/format.cc +1 -2
  200. package/vendor/librocksdb/vendor/rocksdb/table/iterator.cc +4 -0
  201. package/vendor/librocksdb/vendor/rocksdb/table/merging_iterator.cc +18 -13
  202. package/vendor/librocksdb/vendor/rocksdb/table/merging_iterator.h +5 -3
  203. package/vendor/librocksdb/vendor/rocksdb/table/meta_blocks.cc +18 -4
  204. package/vendor/librocksdb/vendor/rocksdb/table/meta_blocks.h +4 -0
  205. package/vendor/librocksdb/vendor/rocksdb/table/plain/plain_table_builder.cc +2 -2
  206. package/vendor/librocksdb/vendor/rocksdb/table/sst_file_dumper.cc +6 -6
  207. package/vendor/librocksdb/vendor/rocksdb/table/sst_file_reader.cc +24 -2
  208. package/vendor/librocksdb/vendor/rocksdb/table/sst_file_writer_collectors.h +3 -1
  209. package/vendor/librocksdb/vendor/rocksdb/table/table_builder.h +8 -7
  210. package/vendor/librocksdb/vendor/rocksdb/table/table_iterator.h +69 -0
  211. package/vendor/librocksdb/vendor/rocksdb/table/table_reader.h +9 -0
  212. package/vendor/librocksdb/vendor/rocksdb/test_util/testutil.cc +25 -0
  213. package/vendor/librocksdb/vendor/rocksdb/test_util/testutil.h +12 -0
  214. package/vendor/librocksdb/vendor/rocksdb/tools/db_bench_tool.cc +32 -0
  215. package/vendor/librocksdb/vendor/rocksdb/tools/ldb_cmd.cc +618 -124
  216. package/vendor/librocksdb/vendor/rocksdb/tools/ldb_cmd_impl.h +19 -1
  217. package/vendor/librocksdb/vendor/rocksdb/tools/ldb_tool.cc +9 -0
  218. package/vendor/librocksdb/vendor/rocksdb/util/aligned_storage.h +24 -0
  219. package/vendor/librocksdb/vendor/rocksdb/util/autovector.h +4 -0
  220. package/vendor/librocksdb/vendor/rocksdb/util/comparator.cc +12 -0
  221. package/vendor/librocksdb/vendor/rocksdb/util/filter_bench.cc +1 -1
  222. package/vendor/librocksdb/vendor/rocksdb/util/random.cc +2 -1
  223. package/vendor/librocksdb/vendor/rocksdb/util/stderr_logger.cc +3 -4
  224. package/vendor/librocksdb/vendor/rocksdb/util/stderr_logger.h +1 -1
  225. package/vendor/librocksdb/vendor/rocksdb/util/udt_util.cc +33 -0
  226. package/vendor/librocksdb/vendor/rocksdb/util/udt_util.h +7 -0
  227. package/vendor/librocksdb/vendor/rocksdb/util/write_batch_util.h +5 -0
  228. package/vendor/librocksdb/vendor/rocksdb/util/xxhash.h +36 -29
  229. package/vendor/librocksdb/vendor/rocksdb/utilities/blob_db/blob_db_impl.h +3 -0
  230. package/vendor/librocksdb/vendor/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +20 -0
  231. package/vendor/librocksdb/vendor/rocksdb/utilities/cache_dump_load_impl.cc +29 -9
  232. package/vendor/librocksdb/vendor/rocksdb/utilities/cache_dump_load_impl.h +14 -3
  233. package/vendor/librocksdb/vendor/rocksdb/utilities/debug.cc +16 -4
  234. package/vendor/librocksdb/vendor/rocksdb/utilities/fault_injection_fs.cc +677 -248
  235. package/vendor/librocksdb/vendor/rocksdb/utilities/fault_injection_fs.h +325 -158
  236. package/vendor/librocksdb/vendor/rocksdb/utilities/option_change_migration/option_change_migration.cc +1 -8
  237. package/vendor/librocksdb/vendor/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.cc +144 -0
  238. package/vendor/librocksdb/vendor/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.h +45 -0
  239. package/vendor/librocksdb/vendor/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc +12 -0
  240. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h +1 -1
  241. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h +3 -3
  242. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/pessimistic_transaction.cc +116 -20
  243. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/pessimistic_transaction.h +33 -1
  244. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +78 -13
  245. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/pessimistic_transaction_db.h +33 -1
  246. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/transaction_base.cc +106 -7
  247. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/transaction_base.h +68 -10
  248. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/transaction_test.h +7 -3
  249. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/transaction_util.cc +8 -5
  250. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/transaction_util.h +7 -4
  251. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/write_prepared_txn.cc +18 -12
  252. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/write_prepared_txn_db.cc +4 -4
  253. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/write_prepared_txn_db.h +17 -0
  254. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/write_unprepared_txn.cc +11 -9
  255. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/write_unprepared_txn_db.cc +2 -1
  256. package/vendor/librocksdb/vendor/rocksdb/utilities/types_util.cc +88 -0
  257. package/vendor/librocksdb/vendor/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +313 -14
  258. package/vendor/librocksdb/vendor/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +7 -0
  259. package/vendor/librocksdb/vendor/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +1 -1
  260. package/vendor/librocksdb/vendor/rocksdb/db/multi_cf_iterator.cc +0 -102
  261. package/vendor/librocksdb/vendor/rocksdb/db/multi_cf_iterator.h +0 -159
@@ -17,6 +17,7 @@
17
17
  #include <cstdio>
18
18
  #include <map>
19
19
  #include <memory>
20
+ #include <optional>
20
21
  #include <set>
21
22
  #include <sstream>
22
23
  #include <stdexcept>
@@ -26,7 +27,9 @@
26
27
  #include <vector>
27
28
 
28
29
  #include "db/arena_wrapped_db_iter.h"
30
+ #include "db/attribute_group_iterator_impl.h"
29
31
  #include "db/builder.h"
32
+ #include "db/coalescing_iterator.h"
30
33
  #include "db/compaction/compaction_job.h"
31
34
  #include "db/convenience_impl.h"
32
35
  #include "db/db_info_dumper.h"
@@ -45,7 +48,6 @@
45
48
  #include "db/memtable.h"
46
49
  #include "db/memtable_list.h"
47
50
  #include "db/merge_context.h"
48
- #include "db/multi_cf_iterator.h"
49
51
  #include "db/periodic_task_scheduler.h"
50
52
  #include "db/range_tombstone_fragmenter.h"
51
53
  #include "db/table_cache.h"
@@ -390,8 +392,8 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
390
392
  if (!s.ok()) {
391
393
  io_s = versions_->io_status();
392
394
  if (!io_s.ok()) {
393
- s = error_handler_.SetBGError(io_s,
394
- BackgroundErrorReason::kManifestWrite);
395
+ error_handler_.SetBGError(io_s,
396
+ BackgroundErrorReason::kManifestWrite);
395
397
  }
396
398
  }
397
399
  }
@@ -636,7 +638,7 @@ Status DBImpl::CloseHelper() {
636
638
  if (!s.ok()) {
637
639
  ROCKS_LOG_WARN(
638
640
  immutable_db_options_.info_log,
639
- "Unable to Sync WAL file %s with error -- %s",
641
+ "Unable to clear writer for WAL %s with error -- %s",
640
642
  LogFileName(immutable_db_options_.GetWalDir(), log_number).c_str(),
641
643
  s.ToString().c_str());
642
644
  // Retain the first error
@@ -915,8 +917,8 @@ Status DBImpl::RegisterRecordSeqnoTimeWorker(const ReadOptions& read_options,
915
917
  read_options, write_options, &edit, &mutex_,
916
918
  directories_.GetDbDir());
917
919
  if (!s.ok() && versions_->io_status().IsIOError()) {
918
- s = error_handler_.SetBGError(versions_->io_status(),
919
- BackgroundErrorReason::kManifestWrite);
920
+ error_handler_.SetBGError(versions_->io_status(),
921
+ BackgroundErrorReason::kManifestWrite);
920
922
  }
921
923
  }
922
924
 
@@ -1548,64 +1550,132 @@ bool DBImpl::WALBufferIsEmpty() {
1548
1550
  return res;
1549
1551
  }
1550
1552
 
1553
+ Status DBImpl::GetOpenWalSizes(std::map<uint64_t, uint64_t>& number_to_size) {
1554
+ assert(number_to_size.empty());
1555
+ InstrumentedMutexLock l(&log_write_mutex_);
1556
+ for (auto& log : logs_) {
1557
+ auto* open_file = log.writer->file();
1558
+ if (open_file) {
1559
+ number_to_size[log.number] = open_file->GetFlushedSize();
1560
+ }
1561
+ }
1562
+ return Status::OK();
1563
+ }
1564
+
1551
1565
  Status DBImpl::SyncWAL() {
1552
1566
  TEST_SYNC_POINT("DBImpl::SyncWAL:Begin");
1553
- autovector<log::Writer*, 1> logs_to_sync;
1554
- bool need_log_dir_sync;
1555
- uint64_t current_log_number;
1567
+ WriteOptions write_options;
1568
+ VersionEdit synced_wals;
1569
+ Status s = SyncWalImpl(/*include_current_wal=*/true, write_options,
1570
+ /*job_context=*/nullptr, &synced_wals,
1571
+ /*error_recovery_in_prog=*/false);
1572
+
1573
+ if (s.ok() && synced_wals.IsWalAddition()) {
1574
+ InstrumentedMutexLock l(&mutex_);
1575
+ // TODO: plumb Env::IOActivity, Env::IOPriority
1576
+ const ReadOptions read_options;
1577
+ s = ApplyWALToManifest(read_options, write_options, &synced_wals);
1578
+ }
1579
+
1580
+ TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2");
1581
+ return s;
1582
+ }
1583
+
1584
+ IOStatus DBImpl::SyncWalImpl(bool include_current_wal,
1585
+ const WriteOptions& write_options,
1586
+ JobContext* job_context, VersionEdit* synced_wals,
1587
+ bool error_recovery_in_prog) {
1588
+ autovector<log::Writer*, 1> wals_to_sync;
1589
+ bool need_wal_dir_sync;
1590
+ // Number of a WAL that was active at the start of call and maybe is by
1591
+ // the end of the call.
1592
+ uint64_t maybe_active_number;
1593
+ // Sync WALs up to this number
1594
+ uint64_t up_to_number;
1556
1595
 
1557
1596
  {
1558
1597
  InstrumentedMutexLock l(&log_write_mutex_);
1559
1598
  assert(!logs_.empty());
1560
1599
 
1561
- // This SyncWAL() call only cares about logs up to this number.
1562
- current_log_number = logfile_number_;
1600
+ maybe_active_number = logfile_number_;
1601
+ up_to_number =
1602
+ include_current_wal ? maybe_active_number : maybe_active_number - 1;
1563
1603
 
1564
- while (logs_.front().number <= current_log_number &&
1565
- logs_.front().IsSyncing()) {
1604
+ while (logs_.front().number <= up_to_number && logs_.front().IsSyncing()) {
1566
1605
  log_sync_cv_.Wait();
1567
1606
  }
1568
1607
  // First check that logs are safe to sync in background.
1569
- for (auto it = logs_.begin();
1570
- it != logs_.end() && it->number <= current_log_number; ++it) {
1571
- if (!it->writer->file()->writable_file()->IsSyncThreadSafe()) {
1572
- return Status::NotSupported(
1573
- "SyncWAL() is not supported for this implementation of WAL file",
1574
- immutable_db_options_.allow_mmap_writes
1575
- ? "try setting Options::allow_mmap_writes to false"
1576
- : Slice());
1577
- }
1608
+ if (include_current_wal &&
1609
+ !logs_.back().writer->file()->writable_file()->IsSyncThreadSafe()) {
1610
+ return IOStatus::NotSupported(
1611
+ "SyncWAL() is not supported for this implementation of WAL file",
1612
+ immutable_db_options_.allow_mmap_writes
1613
+ ? "try setting Options::allow_mmap_writes to false"
1614
+ : Slice());
1578
1615
  }
1579
1616
  for (auto it = logs_.begin();
1580
- it != logs_.end() && it->number <= current_log_number; ++it) {
1617
+ it != logs_.end() && it->number <= up_to_number; ++it) {
1581
1618
  auto& log = *it;
1619
+ // Ensure the head of logs_ is marked as getting_synced if any is.
1582
1620
  log.PrepareForSync();
1583
- logs_to_sync.push_back(log.writer);
1621
+ // If last sync failed on a later WAL, this could be a fully synced
1622
+ // and closed WAL that just needs to be recorded as synced in the
1623
+ // manifest.
1624
+ if (log.writer->file()) {
1625
+ wals_to_sync.push_back(log.writer);
1626
+ }
1584
1627
  }
1585
1628
 
1586
- need_log_dir_sync = !log_dir_synced_;
1629
+ need_wal_dir_sync = !log_dir_synced_;
1587
1630
  }
1588
1631
 
1589
- TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1");
1632
+ if (include_current_wal) {
1633
+ TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1");
1634
+ }
1590
1635
  RecordTick(stats_, WAL_FILE_SYNCED);
1591
- Status status;
1592
- IOStatus io_s;
1593
- // TODO: plumb Env::IOActivity, Env::IOPriority
1594
- const ReadOptions read_options;
1595
- const WriteOptions write_options;
1596
1636
  IOOptions opts;
1597
- io_s = WritableFileWriter::PrepareIOOptions(write_options, opts);
1598
- if (!io_s.ok()) {
1599
- status = io_s;
1600
- }
1637
+ IOStatus io_s = WritableFileWriter::PrepareIOOptions(write_options, opts);
1638
+ std::list<log::Writer*> wals_internally_closed;
1601
1639
  if (io_s.ok()) {
1602
- for (log::Writer* log : logs_to_sync) {
1603
- io_s =
1604
- log->file()->SyncWithoutFlush(opts, immutable_db_options_.use_fsync);
1640
+ for (log::Writer* log : wals_to_sync) {
1641
+ if (job_context) {
1642
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
1643
+ "[JOB %d] Syncing log #%" PRIu64, job_context->job_id,
1644
+ log->get_log_number());
1645
+ }
1646
+ if (error_recovery_in_prog) {
1647
+ log->file()->reset_seen_error();
1648
+ }
1649
+ if (log->get_log_number() >= maybe_active_number) {
1650
+ assert(log->get_log_number() == maybe_active_number);
1651
+ io_s = log->file()->SyncWithoutFlush(opts,
1652
+ immutable_db_options_.use_fsync);
1653
+ } else {
1654
+ io_s = log->file()->Sync(opts, immutable_db_options_.use_fsync);
1655
+ }
1605
1656
  if (!io_s.ok()) {
1606
- status = io_s;
1607
1657
  break;
1608
1658
  }
1659
+ // WALs can be closed when purging obsolete files, but if recycling is
1660
+ // enabled, the log file is closed here so that it can be reused. And
1661
+ // immediate closure here upon final sync makes it easier to guarantee
1662
+ // that Checkpoint doesn't LinkFile on a WAL still open for write, which
1663
+ // might be unsupported for some FileSystem implementations. Close here
1664
+ // should be inexpensive because flush and sync are done, so the kill
1665
+ // switch background_close_inactive_wals is expected to be removed in
1666
+ // the future.
1667
+ if (log->get_log_number() < maybe_active_number &&
1668
+ (immutable_db_options_.recycle_log_file_num > 0 ||
1669
+ !immutable_db_options_.background_close_inactive_wals)) {
1670
+ if (error_recovery_in_prog) {
1671
+ log->file()->reset_seen_error();
1672
+ }
1673
+ io_s = log->file()->Close(opts);
1674
+ wals_internally_closed.push_back(log);
1675
+ if (!io_s.ok()) {
1676
+ break;
1677
+ }
1678
+ }
1609
1679
  }
1610
1680
  }
1611
1681
  if (!io_s.ok()) {
@@ -1615,31 +1685,34 @@ Status DBImpl::SyncWAL() {
1615
1685
  // future writes
1616
1686
  IOStatusCheck(io_s);
1617
1687
  }
1618
- if (status.ok() && need_log_dir_sync) {
1619
- status = directories_.GetWalDir()->FsyncWithDirOptions(
1688
+ if (io_s.ok() && need_wal_dir_sync) {
1689
+ io_s = directories_.GetWalDir()->FsyncWithDirOptions(
1620
1690
  IOOptions(), nullptr,
1621
1691
  DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
1622
1692
  }
1623
- TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2");
1693
+ if (include_current_wal) {
1694
+ TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2");
1624
1695
 
1625
- TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1");
1626
- VersionEdit synced_wals;
1696
+ TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1");
1697
+ } else {
1698
+ TEST_SYNC_POINT_CALLBACK("DBImpl::SyncClosedWals:BeforeReLock",
1699
+ /*arg=*/nullptr);
1700
+ }
1627
1701
  {
1628
1702
  InstrumentedMutexLock l(&log_write_mutex_);
1629
- if (status.ok()) {
1630
- MarkLogsSynced(current_log_number, need_log_dir_sync, &synced_wals);
1703
+ for (auto* wal : wals_internally_closed) {
1704
+ // We can only modify the state of log::Writer under the mutex
1705
+ bool was_closed = wal->PublishIfClosed();
1706
+ assert(was_closed);
1707
+ (void)was_closed;
1708
+ }
1709
+ if (io_s.ok()) {
1710
+ MarkLogsSynced(up_to_number, need_wal_dir_sync, synced_wals);
1631
1711
  } else {
1632
- MarkLogsNotSynced(current_log_number);
1712
+ MarkLogsNotSynced(up_to_number);
1633
1713
  }
1634
1714
  }
1635
- if (status.ok() && synced_wals.IsWalAddition()) {
1636
- InstrumentedMutexLock l(&mutex_);
1637
- status = ApplyWALToManifest(read_options, write_options, &synced_wals);
1638
- }
1639
-
1640
- TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2");
1641
-
1642
- return status;
1715
+ return io_s;
1643
1716
  }
1644
1717
 
1645
1718
  Status DBImpl::ApplyWALToManifest(const ReadOptions& read_options,
@@ -1652,8 +1725,8 @@ Status DBImpl::ApplyWALToManifest(const ReadOptions& read_options,
1652
1725
  read_options, write_options, synced_wals, &mutex_,
1653
1726
  directories_.GetDbDir());
1654
1727
  if (!status.ok() && versions_->io_status().IsIOError()) {
1655
- status = error_handler_.SetBGError(versions_->io_status(),
1656
- BackgroundErrorReason::kManifestWrite);
1728
+ error_handler_.SetBGError(versions_->io_status(),
1729
+ BackgroundErrorReason::kManifestWrite);
1657
1730
  }
1658
1731
  return status;
1659
1732
  }
@@ -1757,16 +1830,19 @@ void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
1757
1830
  wal.GetPreSyncSize() > 0) {
1758
1831
  synced_wals->AddWal(wal.number, WalMetadata(wal.GetPreSyncSize()));
1759
1832
  }
1760
- // Check if the file has been closed, i.e wal.writer->file() == nullptr
1761
- // which can happen if log recycling is enabled, or if all the data in
1762
- // the log has been synced
1833
+ // Reclaim closed WALs (wal.writer->file() == nullptr), and if we don't
1834
+ // need to close before that (background_close_inactive_wals) we can
1835
+ // opportunistically reclaim WALs that happen to be fully synced.
1836
+ // (Probably not worth extra code and mutex release to opportunistically
1837
+ // close WALs that became eligible since last holding the mutex.
1838
+ // FindObsoleteFiles can take care of it.)
1763
1839
  if (wal.writer->file() == nullptr ||
1764
- wal.GetPreSyncSize() == wal.writer->file()->GetFlushedSize()) {
1840
+ (immutable_db_options_.background_close_inactive_wals &&
1841
+ wal.GetPreSyncSize() == wal.writer->file()->GetFlushedSize())) {
1765
1842
  // Fully synced
1766
1843
  logs_to_free_.push_back(wal.ReleaseWriter());
1767
1844
  it = logs_.erase(it);
1768
1845
  } else {
1769
- assert(wal.GetPreSyncSize() < wal.writer->file()->GetFlushedSize());
1770
1846
  wal.FinishSync();
1771
1847
  ++it;
1772
1848
  }
@@ -1853,6 +1929,7 @@ void DBImpl::SchedulePurge() {
1853
1929
  }
1854
1930
 
1855
1931
  void DBImpl::BackgroundCallPurge() {
1932
+ TEST_SYNC_POINT("DBImpl::BackgroundCallPurge:beforeMutexLock");
1856
1933
  mutex_.Lock();
1857
1934
 
1858
1935
  while (!logs_to_free_queue_.empty()) {
@@ -1988,19 +2065,19 @@ InternalIterator* DBImpl::NewInternalIterator(
1988
2065
  read_options, super_version->GetSeqnoToTimeMapping(), arena);
1989
2066
  Status s;
1990
2067
  if (!read_options.ignore_range_deletions) {
1991
- TruncatedRangeDelIterator* mem_tombstone_iter = nullptr;
2068
+ std::unique_ptr<TruncatedRangeDelIterator> mem_tombstone_iter;
1992
2069
  auto range_del_iter = super_version->mem->NewRangeTombstoneIterator(
1993
2070
  read_options, sequence, false /* immutable_memtable */);
1994
2071
  if (range_del_iter == nullptr || range_del_iter->empty()) {
1995
2072
  delete range_del_iter;
1996
2073
  } else {
1997
- mem_tombstone_iter = new TruncatedRangeDelIterator(
2074
+ mem_tombstone_iter = std::make_unique<TruncatedRangeDelIterator>(
1998
2075
  std::unique_ptr<FragmentedRangeTombstoneIterator>(range_del_iter),
1999
2076
  &cfd->ioptions()->internal_comparator, nullptr /* smallest */,
2000
2077
  nullptr /* largest */);
2001
2078
  }
2002
- merge_iter_builder.AddPointAndTombstoneIterator(mem_iter,
2003
- mem_tombstone_iter);
2079
+ merge_iter_builder.AddPointAndTombstoneIterator(
2080
+ mem_iter, std::move(mem_tombstone_iter));
2004
2081
  } else {
2005
2082
  merge_iter_builder.AddIterator(mem_iter);
2006
2083
  }
@@ -2097,7 +2174,7 @@ Status DBImpl::GetEntity(const ReadOptions& _read_options,
2097
2174
  if (_read_options.io_activity != Env::IOActivity::kUnknown &&
2098
2175
  _read_options.io_activity != Env::IOActivity::kGetEntity) {
2099
2176
  return Status::InvalidArgument(
2100
- "Cannot call GetEntity with `ReadOptions::io_activity` != "
2177
+ "Can only call GetEntity with `ReadOptions::io_activity` set to "
2101
2178
  "`Env::IOActivity::kUnknown` or `Env::IOActivity::kGetEntity`");
2102
2179
  }
2103
2180
  ReadOptions read_options(_read_options);
@@ -2124,7 +2201,7 @@ Status DBImpl::GetEntity(const ReadOptions& _read_options, const Slice& key,
2124
2201
  if (_read_options.io_activity != Env::IOActivity::kUnknown &&
2125
2202
  _read_options.io_activity != Env::IOActivity::kGetEntity) {
2126
2203
  s = Status::InvalidArgument(
2127
- "Cannot call GetEntity with `ReadOptions::io_activity` != "
2204
+ "Can only call GetEntity with `ReadOptions::io_activity` set to "
2128
2205
  "`Env::IOActivity::kUnknown` or `Env::IOActivity::kGetEntity`");
2129
2206
  for (size_t i = 0; i < num_column_families; ++i) {
2130
2207
  (*result)[i].SetStatus(s);
@@ -2515,12 +2592,12 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
2515
2592
  return s;
2516
2593
  }
2517
2594
 
2518
- template <class T>
2519
- Status DBImpl::MultiCFSnapshot(
2520
- const ReadOptions& read_options, ReadCallback* callback,
2521
- std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
2522
- iter_deref_func,
2523
- T* cf_list, SequenceNumber* snapshot, bool* sv_from_thread_local) {
2595
+ template <class T, typename IterDerefFuncType>
2596
+ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options,
2597
+ ReadCallback* callback,
2598
+ IterDerefFuncType iter_deref_func, T* cf_list,
2599
+ bool extra_sv_ref, SequenceNumber* snapshot,
2600
+ bool* sv_from_thread_local) {
2524
2601
  PERF_TIMER_GUARD(get_snapshot_time);
2525
2602
 
2526
2603
  assert(sv_from_thread_local);
@@ -2537,7 +2614,7 @@ Status DBImpl::MultiCFSnapshot(
2537
2614
  SuperVersion* super_version = node->super_version;
2538
2615
  ColumnFamilyData* cfd = node->cfd;
2539
2616
  if (super_version != nullptr) {
2540
- if (*sv_from_thread_local) {
2617
+ if (*sv_from_thread_local && !extra_sv_ref) {
2541
2618
  ReturnAndCleanupSuperVersion(cfd, super_version);
2542
2619
  } else {
2543
2620
  CleanupSuperVersion(super_version);
@@ -2553,7 +2630,11 @@ Status DBImpl::MultiCFSnapshot(
2553
2630
  // super version
2554
2631
  auto cf_iter = cf_list->begin();
2555
2632
  auto node = iter_deref_func(cf_iter);
2556
- node->super_version = GetAndRefSuperVersion(node->cfd);
2633
+ if (extra_sv_ref) {
2634
+ node->super_version = node->cfd->GetReferencedSuperVersion(this);
2635
+ } else {
2636
+ node->super_version = GetAndRefSuperVersion(node->cfd);
2637
+ }
2557
2638
  if (check_read_ts) {
2558
2639
  s = FailIfReadCollapsedHistory(node->cfd, node->super_version,
2559
2640
  *(read_options.timestamp));
@@ -2600,7 +2681,7 @@ Status DBImpl::MultiCFSnapshot(
2600
2681
  }
2601
2682
  if (read_options.snapshot == nullptr) {
2602
2683
  if (last_try) {
2603
- TEST_SYNC_POINT("DBImpl::MultiGet::LastTry");
2684
+ TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::LastTry");
2604
2685
  // We're close to max number of retries. For the last retry,
2605
2686
  // acquire the lock so we're sure to succeed
2606
2687
  mutex_.Lock();
@@ -2615,11 +2696,15 @@ Status DBImpl::MultiCFSnapshot(
2615
2696
  ++cf_iter) {
2616
2697
  auto node = iter_deref_func(cf_iter);
2617
2698
  if (!last_try) {
2618
- node->super_version = GetAndRefSuperVersion(node->cfd);
2699
+ if (extra_sv_ref) {
2700
+ node->super_version = node->cfd->GetReferencedSuperVersion(this);
2701
+ } else {
2702
+ node->super_version = GetAndRefSuperVersion(node->cfd);
2703
+ }
2619
2704
  } else {
2620
2705
  node->super_version = node->cfd->GetSuperVersion()->Ref();
2621
2706
  }
2622
- TEST_SYNC_POINT("DBImpl::MultiGet::AfterRefSV");
2707
+ TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::AfterRefSV");
2623
2708
  if (check_read_ts) {
2624
2709
  s = FailIfReadCollapsedHistory(node->cfd, node->super_version,
2625
2710
  *(read_options.timestamp));
@@ -2633,6 +2718,7 @@ Status DBImpl::MultiCFSnapshot(
2633
2718
  break;
2634
2719
  }
2635
2720
  }
2721
+ TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::BeforeCheckingSnapshot");
2636
2722
  if (read_options.snapshot != nullptr || last_try) {
2637
2723
  // If user passed a snapshot, then we don't care if a memtable is
2638
2724
  // sealed or compaction happens because the snapshot would ensure
@@ -2656,7 +2742,7 @@ Status DBImpl::MultiCFSnapshot(
2656
2742
  if (!retry) {
2657
2743
  if (last_try) {
2658
2744
  mutex_.Unlock();
2659
- TEST_SYNC_POINT("DBImpl::MultiGet::AfterLastTryRefSV");
2745
+ TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::AfterLastTryRefSV");
2660
2746
  }
2661
2747
  break;
2662
2748
  }
@@ -2768,37 +2854,37 @@ void DBImpl::MultiGetCommon(const ReadOptions& read_options,
2768
2854
  }
2769
2855
  PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys);
2770
2856
 
2771
- autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>
2772
- multiget_cf_data;
2857
+ autovector<MultiGetKeyRangePerCf, MultiGetContext::MAX_BATCH_SIZE>
2858
+ key_range_per_cf;
2859
+ autovector<ColumnFamilySuperVersionPair, MultiGetContext::MAX_BATCH_SIZE>
2860
+ cf_sv_pairs;
2773
2861
  size_t cf_start = 0;
2774
2862
  ColumnFamilyHandle* cf = sorted_keys[0]->column_family;
2775
2863
 
2776
2864
  for (size_t i = 0; i < num_keys; ++i) {
2777
2865
  KeyContext* key_ctx = sorted_keys[i];
2778
2866
  if (key_ctx->column_family != cf) {
2779
- multiget_cf_data.emplace_back(cf, cf_start, i - cf_start, nullptr);
2867
+ key_range_per_cf.emplace_back(cf_start, i - cf_start);
2868
+ cf_sv_pairs.emplace_back(cf, nullptr);
2780
2869
  cf_start = i;
2781
2870
  cf = key_ctx->column_family;
2782
2871
  }
2783
2872
  }
2784
2873
 
2785
- multiget_cf_data.emplace_back(cf, cf_start, num_keys - cf_start, nullptr);
2786
-
2787
- std::function<MultiGetColumnFamilyData*(
2788
- autovector<MultiGetColumnFamilyData,
2789
- MultiGetContext::MAX_BATCH_SIZE>::iterator&)>
2790
- iter_deref_lambda =
2791
- [](autovector<MultiGetColumnFamilyData,
2792
- MultiGetContext::MAX_BATCH_SIZE>::iterator& cf_iter) {
2793
- return &(*cf_iter);
2794
- };
2874
+ key_range_per_cf.emplace_back(cf_start, num_keys - cf_start);
2875
+ cf_sv_pairs.emplace_back(cf, nullptr);
2795
2876
 
2796
- SequenceNumber consistent_seqnum;
2797
- bool sv_from_thread_local;
2798
- Status s = MultiCFSnapshot<
2799
- autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>>(
2800
- read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
2801
- &consistent_seqnum, &sv_from_thread_local);
2877
+ SequenceNumber consistent_seqnum = kMaxSequenceNumber;
2878
+ bool sv_from_thread_local = false;
2879
+ Status s = MultiCFSnapshot<autovector<ColumnFamilySuperVersionPair,
2880
+ MultiGetContext::MAX_BATCH_SIZE>>(
2881
+ read_options, nullptr,
2882
+ [](autovector<ColumnFamilySuperVersionPair,
2883
+ MultiGetContext::MAX_BATCH_SIZE>::iterator& cf_iter) {
2884
+ return &(*cf_iter);
2885
+ },
2886
+ &cf_sv_pairs,
2887
+ /* extra_sv_ref */ false, &consistent_seqnum, &sv_from_thread_local);
2802
2888
 
2803
2889
  if (!s.ok()) {
2804
2890
  for (size_t i = 0; i < num_keys; ++i) {
@@ -2816,31 +2902,40 @@ void DBImpl::MultiGetCommon(const ReadOptions& read_options,
2816
2902
  read_callback = &timestamp_read_callback;
2817
2903
  }
2818
2904
 
2819
- auto cf_iter = multiget_cf_data.begin();
2820
- for (; cf_iter != multiget_cf_data.end(); ++cf_iter) {
2821
- s = MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys,
2822
- &sorted_keys, cf_iter->super_version, consistent_seqnum,
2905
+ assert(key_range_per_cf.size() == cf_sv_pairs.size());
2906
+ auto key_range_per_cf_iter = key_range_per_cf.begin();
2907
+ auto cf_sv_pair_iter = cf_sv_pairs.begin();
2908
+ while (key_range_per_cf_iter != key_range_per_cf.end() &&
2909
+ cf_sv_pair_iter != cf_sv_pairs.end()) {
2910
+ s = MultiGetImpl(read_options, key_range_per_cf_iter->start,
2911
+ key_range_per_cf_iter->num_keys, &sorted_keys,
2912
+ cf_sv_pair_iter->super_version, consistent_seqnum,
2823
2913
  read_callback);
2824
2914
  if (!s.ok()) {
2825
2915
  break;
2826
2916
  }
2917
+ ++key_range_per_cf_iter;
2918
+ ++cf_sv_pair_iter;
2827
2919
  }
2828
2920
  if (!s.ok()) {
2829
2921
  assert(s.IsTimedOut() || s.IsAborted());
2830
- for (++cf_iter; cf_iter != multiget_cf_data.end(); ++cf_iter) {
2831
- for (size_t i = cf_iter->start; i < cf_iter->start + cf_iter->num_keys;
2922
+ for (++key_range_per_cf_iter;
2923
+ key_range_per_cf_iter != key_range_per_cf.end();
2924
+ ++key_range_per_cf_iter) {
2925
+ for (size_t i = key_range_per_cf_iter->start;
2926
+ i < key_range_per_cf_iter->start + key_range_per_cf_iter->num_keys;
2832
2927
  ++i) {
2833
2928
  *sorted_keys[i]->s = s;
2834
2929
  }
2835
2930
  }
2836
2931
  }
2837
2932
 
2838
- for (const auto& iter : multiget_cf_data) {
2933
+ for (const auto& cf_sv_pair : cf_sv_pairs) {
2839
2934
  if (sv_from_thread_local) {
2840
- ReturnAndCleanupSuperVersion(iter.cfd, iter.super_version);
2935
+ ReturnAndCleanupSuperVersion(cf_sv_pair.cfd, cf_sv_pair.super_version);
2841
2936
  } else {
2842
- TEST_SYNC_POINT("DBImpl::MultiGet::BeforeLastTryUnRefSV");
2843
- CleanupSuperVersion(iter.super_version);
2937
+ TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::BeforeLastTryUnRefSV");
2938
+ CleanupSuperVersion(cf_sv_pair.super_version);
2844
2939
  }
2845
2940
  }
2846
2941
  }
@@ -2972,21 +3067,18 @@ void DBImpl::MultiGetWithCallbackImpl(
2972
3067
  const ReadOptions& read_options, ColumnFamilyHandle* column_family,
2973
3068
  ReadCallback* callback,
2974
3069
  autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
2975
- std::array<MultiGetColumnFamilyData, 1> multiget_cf_data;
2976
- multiget_cf_data[0] = MultiGetColumnFamilyData(column_family, nullptr);
2977
- std::function<MultiGetColumnFamilyData*(
2978
- std::array<MultiGetColumnFamilyData, 1>::iterator&)>
2979
- iter_deref_lambda =
2980
- [](std::array<MultiGetColumnFamilyData, 1>::iterator& cf_iter) {
2981
- return &(*cf_iter);
2982
- };
2983
-
3070
+ std::array<ColumnFamilySuperVersionPair, 1> cf_sv_pairs;
3071
+ cf_sv_pairs[0] = ColumnFamilySuperVersionPair(column_family, nullptr);
2984
3072
  size_t num_keys = sorted_keys->size();
2985
- SequenceNumber consistent_seqnum;
2986
- bool sv_from_thread_local;
2987
- Status s = MultiCFSnapshot<std::array<MultiGetColumnFamilyData, 1>>(
2988
- read_options, callback, iter_deref_lambda, &multiget_cf_data,
2989
- &consistent_seqnum, &sv_from_thread_local);
3073
+ SequenceNumber consistent_seqnum = kMaxSequenceNumber;
3074
+ bool sv_from_thread_local = false;
3075
+ Status s = MultiCFSnapshot<std::array<ColumnFamilySuperVersionPair, 1>>(
3076
+ read_options, callback,
3077
+ [](std::array<ColumnFamilySuperVersionPair, 1>::iterator& cf_iter) {
3078
+ return &(*cf_iter);
3079
+ },
3080
+ &cf_sv_pairs,
3081
+ /* extra_sv_ref */ false, &consistent_seqnum, &sv_from_thread_local);
2990
3082
  if (!s.ok()) {
2991
3083
  return;
2992
3084
  }
@@ -3025,11 +3117,11 @@ void DBImpl::MultiGetWithCallbackImpl(
3025
3117
  }
3026
3118
 
3027
3119
  s = MultiGetImpl(read_options, 0, num_keys, sorted_keys,
3028
- multiget_cf_data[0].super_version, consistent_seqnum,
3120
+ cf_sv_pairs[0].super_version, consistent_seqnum,
3029
3121
  read_callback);
3030
3122
  assert(s.ok() || s.IsTimedOut() || s.IsAborted());
3031
- ReturnAndCleanupSuperVersion(multiget_cf_data[0].cfd,
3032
- multiget_cf_data[0].super_version);
3123
+ ReturnAndCleanupSuperVersion(cf_sv_pairs[0].cfd,
3124
+ cf_sv_pairs[0].super_version);
3033
3125
  }
3034
3126
 
3035
3127
  // The actual implementation of batched MultiGet. Parameters -
@@ -3168,22 +3260,55 @@ void DBImpl::MultiGetEntity(const ReadOptions& _read_options, size_t num_keys,
3168
3260
  ColumnFamilyHandle** column_families,
3169
3261
  const Slice* keys, PinnableWideColumns* results,
3170
3262
  Status* statuses, bool sorted_input) {
3263
+ assert(statuses);
3264
+
3265
+ if (!column_families) {
3266
+ const Status s = Status::InvalidArgument(
3267
+ "Cannot call MultiGetEntity without column families");
3268
+ for (size_t i = 0; i < num_keys; ++i) {
3269
+ statuses[i] = s;
3270
+ }
3271
+
3272
+ return;
3273
+ }
3274
+
3275
+ if (!keys) {
3276
+ const Status s =
3277
+ Status::InvalidArgument("Cannot call MultiGetEntity without keys");
3278
+ for (size_t i = 0; i < num_keys; ++i) {
3279
+ statuses[i] = s;
3280
+ }
3281
+
3282
+ return;
3283
+ }
3284
+
3285
+ if (!results) {
3286
+ const Status s = Status::InvalidArgument(
3287
+ "Cannot call MultiGetEntity without PinnableWideColumns objects");
3288
+ for (size_t i = 0; i < num_keys; ++i) {
3289
+ statuses[i] = s;
3290
+ }
3291
+
3292
+ return;
3293
+ }
3294
+
3171
3295
  if (_read_options.io_activity != Env::IOActivity::kUnknown &&
3172
3296
  _read_options.io_activity != Env::IOActivity::kMultiGetEntity) {
3173
- Status s = Status::InvalidArgument(
3174
- "Can only call MultiGetEntity with `ReadOptions::io_activity` is "
3297
+ const Status s = Status::InvalidArgument(
3298
+ "Can only call MultiGetEntity with `ReadOptions::io_activity` set to "
3175
3299
  "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGetEntity`");
3176
3300
  for (size_t i = 0; i < num_keys; ++i) {
3177
- if (statuses[i].ok()) {
3178
- statuses[i] = s;
3179
- }
3301
+ statuses[i] = s;
3180
3302
  }
3303
+
3181
3304
  return;
3182
3305
  }
3306
+
3183
3307
  ReadOptions read_options(_read_options);
3184
3308
  if (read_options.io_activity == Env::IOActivity::kUnknown) {
3185
3309
  read_options.io_activity = Env::IOActivity::kMultiGetEntity;
3186
3310
  }
3311
+
3187
3312
  MultiGetCommon(read_options, num_keys, column_families, keys,
3188
3313
  /* values */ nullptr, results, /* timestamps */ nullptr,
3189
3314
  statuses, sorted_input);
@@ -3193,22 +3318,54 @@ void DBImpl::MultiGetEntity(const ReadOptions& _read_options,
3193
3318
  ColumnFamilyHandle* column_family, size_t num_keys,
3194
3319
  const Slice* keys, PinnableWideColumns* results,
3195
3320
  Status* statuses, bool sorted_input) {
3321
+ assert(statuses);
3322
+
3323
+ if (!column_family) {
3324
+ const Status s = Status::InvalidArgument(
3325
+ "Cannot call MultiGetEntity without a column family handle");
3326
+ for (size_t i = 0; i < num_keys; ++i) {
3327
+ statuses[i] = s;
3328
+ }
3329
+
3330
+ return;
3331
+ }
3332
+
3333
+ if (!keys) {
3334
+ const Status s =
3335
+ Status::InvalidArgument("Cannot call MultiGetEntity without keys");
3336
+ for (size_t i = 0; i < num_keys; ++i) {
3337
+ statuses[i] = s;
3338
+ }
3339
+
3340
+ return;
3341
+ }
3342
+
3343
+ if (!results) {
3344
+ const Status s = Status::InvalidArgument(
3345
+ "Cannot call MultiGetEntity without PinnableWideColumns objects");
3346
+ for (size_t i = 0; i < num_keys; ++i) {
3347
+ statuses[i] = s;
3348
+ }
3349
+
3350
+ return;
3351
+ }
3352
+
3196
3353
  if (_read_options.io_activity != Env::IOActivity::kUnknown &&
3197
3354
  _read_options.io_activity != Env::IOActivity::kMultiGetEntity) {
3198
- Status s = Status::InvalidArgument(
3199
- "Can only call MultiGetEntity with `ReadOptions::io_activity` is "
3355
+ const Status s = Status::InvalidArgument(
3356
+ "Can only call MultiGetEntity with `ReadOptions::io_activity` set to "
3200
3357
  "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGetEntity`");
3201
3358
  for (size_t i = 0; i < num_keys; ++i) {
3202
- if (statuses[i].ok()) {
3203
- statuses[i] = s;
3204
- }
3359
+ statuses[i] = s;
3205
3360
  }
3206
3361
  return;
3207
3362
  }
3363
+
3208
3364
  ReadOptions read_options(_read_options);
3209
3365
  if (read_options.io_activity == Env::IOActivity::kUnknown) {
3210
3366
  read_options.io_activity = Env::IOActivity::kMultiGetEntity;
3211
3367
  }
3368
+
3212
3369
  MultiGetCommon(read_options, column_family, num_keys, keys,
3213
3370
  /* values */ nullptr, results, /* timestamps */ nullptr,
3214
3371
  statuses, sorted_input);
@@ -3217,18 +3374,34 @@ void DBImpl::MultiGetEntity(const ReadOptions& _read_options,
3217
3374
  void DBImpl::MultiGetEntity(const ReadOptions& _read_options, size_t num_keys,
3218
3375
  const Slice* keys,
3219
3376
  PinnableAttributeGroups* results) {
3377
+ assert(results);
3378
+
3379
+ if (!keys) {
3380
+ const Status s =
3381
+ Status::InvalidArgument("Cannot call MultiGetEntity without keys");
3382
+ for (size_t i = 0; i < num_keys; ++i) {
3383
+ for (size_t j = 0; j < results[i].size(); ++j) {
3384
+ results[i][j].SetStatus(s);
3385
+ }
3386
+ }
3387
+
3388
+ return;
3389
+ }
3390
+
3220
3391
  if (_read_options.io_activity != Env::IOActivity::kUnknown &&
3221
3392
  _read_options.io_activity != Env::IOActivity::kMultiGetEntity) {
3222
- Status s = Status::InvalidArgument(
3223
- "Can only call MultiGetEntity with ReadOptions::io_activity` is "
3393
+ const Status s = Status::InvalidArgument(
3394
+ "Can only call MultiGetEntity with `ReadOptions::io_activity` set to "
3224
3395
  "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGetEntity`");
3225
3396
  for (size_t i = 0; i < num_keys; ++i) {
3226
3397
  for (size_t j = 0; j < results[i].size(); ++j) {
3227
3398
  results[i][j].SetStatus(s);
3228
3399
  }
3229
3400
  }
3401
+
3230
3402
  return;
3231
3403
  }
3404
+
3232
3405
  ReadOptions read_options(_read_options);
3233
3406
  if (read_options.io_activity == Env::IOActivity::kUnknown) {
3234
3407
  read_options.io_activity = Env::IOActivity::kMultiGetEntity;
@@ -3246,6 +3419,7 @@ void DBImpl::MultiGetEntity(const ReadOptions& _read_options, size_t num_keys,
3246
3419
  ++total_count;
3247
3420
  }
3248
3421
  }
3422
+
3249
3423
  std::vector<Status> statuses(total_count);
3250
3424
  std::vector<PinnableWideColumns> columns(total_count);
3251
3425
  MultiGetCommon(read_options, total_count, column_families.data(),
@@ -3266,6 +3440,15 @@ void DBImpl::MultiGetEntity(const ReadOptions& _read_options, size_t num_keys,
3266
3440
  }
3267
3441
  }
3268
3442
 
3443
+ void DBImpl::MultiGetEntityWithCallback(
3444
+ const ReadOptions& read_options, ColumnFamilyHandle* column_family,
3445
+ ReadCallback* callback,
3446
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
3447
+ assert(read_options.io_activity == Env::IOActivity::kMultiGetEntity);
3448
+
3449
+ MultiGetWithCallbackImpl(read_options, column_family, callback, sorted_keys);
3450
+ }
3451
+
3269
3452
  Status DBImpl::WrapUpCreateColumnFamilies(
3270
3453
  const ReadOptions& read_options, const WriteOptions& write_options,
3271
3454
  const std::vector<const ColumnFamilyOptions*>& cf_options) {
@@ -3744,29 +3927,49 @@ ArenaWrappedDBIter* DBImpl::NewIteratorImpl(
3744
3927
  return db_iter;
3745
3928
  }
3746
3929
 
3747
- std::unique_ptr<Iterator> DBImpl::NewMultiCfIterator(
3930
+ std::unique_ptr<Iterator> DBImpl::NewCoalescingIterator(
3748
3931
  const ReadOptions& _read_options,
3749
3932
  const std::vector<ColumnFamilyHandle*>& column_families) {
3933
+ return NewMultiCfIterator<Iterator, CoalescingIterator>(
3934
+ _read_options, column_families, [](const Status& s) {
3935
+ return std::unique_ptr<Iterator>(NewErrorIterator(s));
3936
+ });
3937
+ }
3938
+
3939
+ std::unique_ptr<AttributeGroupIterator> DBImpl::NewAttributeGroupIterator(
3940
+ const ReadOptions& _read_options,
3941
+ const std::vector<ColumnFamilyHandle*>& column_families) {
3942
+ return NewMultiCfIterator<AttributeGroupIterator, AttributeGroupIteratorImpl>(
3943
+ _read_options, column_families,
3944
+ [](const Status& s) { return NewAttributeGroupErrorIterator(s); });
3945
+ }
3946
+
3947
+ template <typename IterType, typename ImplType, typename ErrorIteratorFuncType>
3948
+ std::unique_ptr<IterType> DBImpl::NewMultiCfIterator(
3949
+ const ReadOptions& _read_options,
3950
+ const std::vector<ColumnFamilyHandle*>& column_families,
3951
+ ErrorIteratorFuncType error_iterator_func) {
3750
3952
  if (column_families.size() == 0) {
3751
- return std::unique_ptr<Iterator>(NewErrorIterator(
3752
- Status::InvalidArgument("No Column Family was provided")));
3953
+ return error_iterator_func(
3954
+ Status::InvalidArgument("No Column Family was provided"));
3753
3955
  }
3754
3956
  const Comparator* first_comparator = column_families[0]->GetComparator();
3755
3957
  for (size_t i = 1; i < column_families.size(); ++i) {
3756
3958
  const Comparator* cf_comparator = column_families[i]->GetComparator();
3757
3959
  if (first_comparator != cf_comparator &&
3758
3960
  first_comparator->GetId().compare(cf_comparator->GetId()) != 0) {
3759
- return std::unique_ptr<Iterator>(NewErrorIterator(Status::InvalidArgument(
3760
- "Different comparators are being used across CFs")));
3961
+ return error_iterator_func(Status::InvalidArgument(
3962
+ "Different comparators are being used across CFs"));
3761
3963
  }
3762
3964
  }
3763
3965
  std::vector<Iterator*> child_iterators;
3764
3966
  Status s = NewIterators(_read_options, column_families, &child_iterators);
3765
- if (s.ok()) {
3766
- return std::make_unique<MultiCfIterator>(first_comparator, column_families,
3767
- std::move(child_iterators));
3967
+ if (!s.ok()) {
3968
+ return error_iterator_func(s);
3768
3969
  }
3769
- return std::unique_ptr<Iterator>(NewErrorIterator(s));
3970
+ return std::make_unique<ImplType>(column_families[0]->GetComparator(),
3971
+ column_families,
3972
+ std::move(child_iterators));
3770
3973
  }
3771
3974
 
3772
3975
  Status DBImpl::NewIterators(
@@ -3791,69 +3994,62 @@ Status DBImpl::NewIterators(
3791
3994
  "ReadTier::kPersistedData is not yet supported in iterators.");
3792
3995
  }
3793
3996
 
3794
- if (read_options.timestamp) {
3795
- for (auto* cf : column_families) {
3796
- assert(cf);
3797
- const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp));
3798
- if (!s.ok()) {
3799
- return s;
3800
- }
3997
+ autovector<ColumnFamilySuperVersionPair, MultiGetContext::MAX_BATCH_SIZE>
3998
+ cf_sv_pairs;
3999
+
4000
+ Status s;
4001
+ for (auto* cf : column_families) {
4002
+ assert(cf);
4003
+ if (read_options.timestamp) {
4004
+ s = FailIfTsMismatchCf(cf, *(read_options.timestamp));
4005
+ } else {
4006
+ s = FailIfCfHasTs(cf);
3801
4007
  }
3802
- } else {
3803
- for (auto* cf : column_families) {
3804
- assert(cf);
3805
- const Status s = FailIfCfHasTs(cf);
3806
- if (!s.ok()) {
3807
- return s;
3808
- }
4008
+ if (!s.ok()) {
4009
+ return s;
3809
4010
  }
4011
+ cf_sv_pairs.emplace_back(cf, nullptr);
3810
4012
  }
3811
-
3812
4013
  iterators->clear();
3813
4014
  iterators->reserve(column_families.size());
3814
- autovector<std::tuple<ColumnFamilyHandleImpl*, SuperVersion*>> cfh_to_sv;
3815
- const bool check_read_ts =
3816
- read_options.timestamp && read_options.timestamp->size() > 0;
3817
- for (auto cf : column_families) {
3818
- auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(cf);
3819
- auto cfd = cfh->cfd();
3820
- SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
3821
- cfh_to_sv.emplace_back(cfh, sv);
3822
- if (check_read_ts) {
3823
- const Status s =
3824
- FailIfReadCollapsedHistory(cfd, sv, *(read_options.timestamp));
3825
- if (!s.ok()) {
3826
- for (auto prev_entry : cfh_to_sv) {
3827
- CleanupSuperVersion(std::get<1>(prev_entry));
3828
- }
3829
- return s;
3830
- }
3831
- }
4015
+
4016
+ SequenceNumber consistent_seqnum = kMaxSequenceNumber;
4017
+ bool sv_from_thread_local = false;
4018
+ s = MultiCFSnapshot<autovector<ColumnFamilySuperVersionPair,
4019
+ MultiGetContext::MAX_BATCH_SIZE>>(
4020
+ read_options, nullptr /* read_callback*/,
4021
+ [](autovector<ColumnFamilySuperVersionPair,
4022
+ MultiGetContext::MAX_BATCH_SIZE>::iterator& cf_iter) {
4023
+ return &(*cf_iter);
4024
+ },
4025
+ &cf_sv_pairs,
4026
+ /* extra_sv_ref */ true, &consistent_seqnum, &sv_from_thread_local);
4027
+ if (!s.ok()) {
4028
+ return s;
3832
4029
  }
3833
- assert(cfh_to_sv.size() == column_families.size());
4030
+
4031
+ assert(cf_sv_pairs.size() == column_families.size());
3834
4032
  if (read_options.tailing) {
3835
- for (auto [cfh, sv] : cfh_to_sv) {
3836
- auto iter = new ForwardIterator(this, read_options, cfh->cfd(), sv,
4033
+ for (const auto& cf_sv_pair : cf_sv_pairs) {
4034
+ auto iter = new ForwardIterator(this, read_options, cf_sv_pair.cfd,
4035
+ cf_sv_pair.super_version,
3837
4036
  /* allow_unprepared_value */ true);
3838
- iterators->push_back(NewDBIterator(
3839
- env_, read_options, *cfh->cfd()->ioptions(), sv->mutable_cf_options,
3840
- cfh->cfd()->user_comparator(), iter, sv->current, kMaxSequenceNumber,
3841
- sv->mutable_cf_options.max_sequential_skip_in_iterations,
3842
- nullptr /*read_callback*/, cfh));
4037
+ iterators->push_back(
4038
+ NewDBIterator(env_, read_options, *cf_sv_pair.cfd->ioptions(),
4039
+ cf_sv_pair.super_version->mutable_cf_options,
4040
+ cf_sv_pair.cfd->user_comparator(), iter,
4041
+ cf_sv_pair.super_version->current, kMaxSequenceNumber,
4042
+ cf_sv_pair.super_version->mutable_cf_options
4043
+ .max_sequential_skip_in_iterations,
4044
+ nullptr /*read_callback*/, cf_sv_pair.cfh));
3843
4045
  }
3844
4046
  } else {
3845
- // Note: no need to consider the special case of
3846
- // last_seq_same_as_publish_seq_==false since NewIterators is overridden in
3847
- // WritePreparedTxnDB
3848
- auto snapshot = read_options.snapshot != nullptr
3849
- ? read_options.snapshot->GetSequenceNumber()
3850
- : versions_->LastSequence();
3851
- for (auto [cfh, sv] : cfh_to_sv) {
3852
- iterators->push_back(NewIteratorImpl(read_options, cfh, sv, snapshot,
3853
- nullptr /*read_callback*/));
4047
+ for (const auto& cf_sv_pair : cf_sv_pairs) {
4048
+ iterators->push_back(NewIteratorImpl(
4049
+ read_options, cf_sv_pair.cfh, cf_sv_pair.super_version,
4050
+ consistent_seqnum, nullptr /*read_callback*/));
3854
4051
  }
3855
4052
  }
3856
-
3857
4053
  return Status::OK();
3858
4054
  }
3859
4055
 
@@ -3968,8 +4164,8 @@ DBImpl::CreateTimestampedSnapshotImpl(SequenceNumber snapshot_seq, uint64_t ts,
3968
4164
  std::shared_ptr<const SnapshotImpl> latest =
3969
4165
  timestamped_snapshots_.GetSnapshot(std::numeric_limits<uint64_t>::max());
3970
4166
 
3971
- // If there is already a latest timestamped snapshot, then we need to do some
3972
- // checks.
4167
+ // If there is already a latest timestamped snapshot, then we need to do
4168
+ // some checks.
3973
4169
  if (latest) {
3974
4170
  uint64_t latest_snap_ts = latest->GetTimestamp();
3975
4171
  SequenceNumber latest_snap_seq = latest->GetSequenceNumber();
@@ -3978,8 +4174,8 @@ DBImpl::CreateTimestampedSnapshotImpl(SequenceNumber snapshot_seq, uint64_t ts,
3978
4174
  Status status;
3979
4175
  std::shared_ptr<const SnapshotImpl> ret;
3980
4176
  if (latest_snap_ts > ts) {
3981
- // A snapshot created later cannot have smaller timestamp than a previous
3982
- // timestamped snapshot.
4177
+ // A snapshot created later cannot have smaller timestamp than a
4178
+ // previous timestamped snapshot.
3983
4179
  needs_create_snap = false;
3984
4180
  std::ostringstream oss;
3985
4181
  oss << "snapshot exists with larger timestamp " << latest_snap_ts << " > "
@@ -4093,7 +4289,8 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
4093
4289
 
4094
4290
  // Calculate a new threshold, skipping those CFs where compactions are
4095
4291
  // scheduled. We do not do the same pass as the previous loop because
4096
- // mutex might be unlocked during the loop, making the result inaccurate.
4292
+ // mutex might be unlocked during the loop, making the result
4293
+ // inaccurate.
4097
4294
  SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber;
4098
4295
  for (auto* cfd : *versions_->GetColumnFamilySet()) {
4099
4296
  if (CfdListContains(cf_scheduled, cfd) ||
@@ -4347,8 +4544,11 @@ bool DBImpl::GetAggregatedIntProperty(const Slice& property,
4347
4544
  if (property_info == nullptr || property_info->handle_int == nullptr) {
4348
4545
  return false;
4349
4546
  }
4547
+ auto aggregator = CreateIntPropertyAggregator(property);
4548
+ if (aggregator == nullptr) {
4549
+ return false;
4550
+ }
4350
4551
 
4351
- uint64_t sum = 0;
4352
4552
  bool ret = true;
4353
4553
  {
4354
4554
  // Needs mutex to protect the list of column families.
@@ -4362,14 +4562,14 @@ bool DBImpl::GetAggregatedIntProperty(const Slice& property,
4362
4562
  // GetIntPropertyInternal may release db mutex and re-acquire it.
4363
4563
  mutex_.AssertHeld();
4364
4564
  if (ret) {
4365
- sum += value;
4565
+ aggregator->Add(cfd, value);
4366
4566
  } else {
4367
4567
  ret = false;
4368
4568
  break;
4369
4569
  }
4370
4570
  }
4371
4571
  }
4372
- *aggregated_value = sum;
4572
+ *aggregated_value = aggregator->Aggregate();
4373
4573
  return ret;
4374
4574
  }
4375
4575
 
@@ -4521,7 +4721,8 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
4521
4721
  sizes[i] = 0;
4522
4722
  if (options.include_files) {
4523
4723
  sizes[i] += versions_->ApproximateSize(
4524
- options, read_options, v, k1.Encode(), k2.Encode(), /*start_level=*/0,
4724
+ options, read_options, v, k1.Encode(), k2.Encode(),
4725
+ /*start_level=*/0,
4525
4726
  /*end_level=*/-1, TableReaderCaller::kUserApproximateSize);
4526
4727
  }
4527
4728
  if (options.include_memtables) {
@@ -4806,9 +5007,9 @@ void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
4806
5007
  static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
4807
5008
  auto* sv = GetAndRefSuperVersion(cfd);
4808
5009
  {
4809
- // Without mutex, Version::GetColumnFamilyMetaData will have data race with
4810
- // Compaction::MarkFilesBeingCompacted. One solution is to use mutex, but
4811
- // this may cause regression. An alternative is to make
5010
+ // Without mutex, Version::GetColumnFamilyMetaData will have data race
5011
+ // with Compaction::MarkFilesBeingCompacted. One solution is to use mutex,
5012
+ // but this may cause regression. An alternative is to make
4812
5013
  // FileMetaData::being_compacted atomic, but it will make FileMetaData
4813
5014
  // non-copy-able. Another option is to separate these variables from
4814
5015
  // original FileMetaData struct, and this requires re-organization of data
@@ -5040,6 +5241,14 @@ Status DestroyDB(const std::string& dbname, const Options& options,
5040
5241
  Env* env = soptions.env;
5041
5242
  std::vector<std::string> filenames;
5042
5243
  bool wal_in_db_path = soptions.IsWalDirSameAsDBPath();
5244
+ auto sfm = static_cast_with_check<SstFileManagerImpl>(
5245
+ options.sst_file_manager.get());
5246
+ // Allocate a separate trash bucket to be used by all the to be deleted
5247
+ // files, so we can later wait for this bucket to be empty before return.
5248
+ std::optional<int32_t> bucket;
5249
+ if (sfm) {
5250
+ bucket = sfm->NewTrashBucket();
5251
+ }
5043
5252
 
5044
5253
  // Reset the logger because it holds a handle to the
5045
5254
  // log file and prevents cleanup and directory removal
@@ -5051,6 +5260,7 @@ Status DestroyDB(const std::string& dbname, const Options& options,
5051
5260
  /*IODebugContext*=*/nullptr)
5052
5261
  .PermitUncheckedError();
5053
5262
 
5263
+ std::set<std::string> paths_to_delete;
5054
5264
  FileLock* lock;
5055
5265
  const std::string lockname = LockFileName(dbname);
5056
5266
  Status result = env->LockFile(lockname, &lock);
@@ -5067,10 +5277,9 @@ Status DestroyDB(const std::string& dbname, const Options& options,
5067
5277
  del = DestroyDB(path_to_delete, options);
5068
5278
  } else if (type == kTableFile || type == kWalFile ||
5069
5279
  type == kBlobFile) {
5070
- del = DeleteDBFile(
5071
- &soptions, path_to_delete, dbname,
5072
- /*force_bg=*/false,
5073
- /*force_fg=*/(type == kWalFile) ? !wal_in_db_path : false);
5280
+ del = DeleteUnaccountedDBFile(&soptions, path_to_delete, dbname,
5281
+ /*force_bg=*/false,
5282
+ /*force_fg=*/false, bucket);
5074
5283
  } else {
5075
5284
  del = env->DeleteFile(path_to_delete);
5076
5285
  }
@@ -5079,6 +5288,7 @@ Status DestroyDB(const std::string& dbname, const Options& options,
5079
5288
  }
5080
5289
  }
5081
5290
  }
5291
+ paths_to_delete.insert(dbname);
5082
5292
 
5083
5293
  std::set<std::string> paths;
5084
5294
  for (const DbPath& db_path : options.db_paths) {
@@ -5100,18 +5310,19 @@ Status DestroyDB(const std::string& dbname, const Options& options,
5100
5310
  (type == kTableFile ||
5101
5311
  type == kBlobFile)) { // Lock file will be deleted at end
5102
5312
  std::string file_path = path + "/" + fname;
5103
- Status del = DeleteDBFile(&soptions, file_path, dbname,
5104
- /*force_bg=*/false, /*force_fg=*/false);
5313
+ Status del = DeleteUnaccountedDBFile(&soptions, file_path, dbname,
5314
+ /*force_bg=*/false,
5315
+ /*force_fg=*/false, bucket);
5105
5316
  if (!del.ok() && result.ok()) {
5106
5317
  result = del;
5107
5318
  }
5108
5319
  }
5109
5320
  }
5110
- // TODO: Should we return an error if we cannot delete the directory?
5111
- env->DeleteDir(path).PermitUncheckedError();
5112
5321
  }
5113
5322
  }
5114
5323
 
5324
+ paths_to_delete.merge(paths);
5325
+
5115
5326
  std::vector<std::string> walDirFiles;
5116
5327
  std::string archivedir = ArchivalDirectory(dbname);
5117
5328
  bool wal_dir_exists = false;
@@ -5135,46 +5346,49 @@ Status DestroyDB(const std::string& dbname, const Options& options,
5135
5346
  // Delete archival files.
5136
5347
  for (const auto& file : archiveFiles) {
5137
5348
  if (ParseFileName(file, &number, &type) && type == kWalFile) {
5138
- Status del =
5139
- DeleteDBFile(&soptions, archivedir + "/" + file, archivedir,
5140
- /*force_bg=*/false, /*force_fg=*/!wal_in_db_path);
5349
+ Status del = DeleteUnaccountedDBFile(
5350
+ &soptions, archivedir + "/" + file, archivedir,
5351
+ /*force_bg=*/false, /*force_fg=*/!wal_in_db_path, bucket);
5141
5352
  if (!del.ok() && result.ok()) {
5142
5353
  result = del;
5143
5354
  }
5144
5355
  }
5145
5356
  }
5146
- // Ignore error in case dir contains other files
5147
- env->DeleteDir(archivedir).PermitUncheckedError();
5357
+ paths_to_delete.insert(archivedir);
5148
5358
  }
5149
5359
 
5150
5360
  // Delete log files in the WAL dir
5151
5361
  if (wal_dir_exists) {
5152
5362
  for (const auto& file : walDirFiles) {
5153
5363
  if (ParseFileName(file, &number, &type) && type == kWalFile) {
5154
- Status del =
5155
- DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number),
5156
- soptions.wal_dir, /*force_bg=*/false,
5157
- /*force_fg=*/!wal_in_db_path);
5364
+ Status del = DeleteUnaccountedDBFile(
5365
+ &soptions, LogFileName(soptions.wal_dir, number),
5366
+ soptions.wal_dir, /*force_bg=*/false,
5367
+ /*force_fg=*/!wal_in_db_path, bucket);
5158
5368
  if (!del.ok() && result.ok()) {
5159
5369
  result = del;
5160
5370
  }
5161
5371
  }
5162
5372
  }
5163
- // Ignore error in case dir contains other files
5164
- env->DeleteDir(soptions.wal_dir).PermitUncheckedError();
5373
+ paths_to_delete.insert(soptions.wal_dir);
5165
5374
  }
5166
5375
 
5167
5376
  // Ignore error since state is already gone
5168
5377
  env->UnlockFile(lock).PermitUncheckedError();
5169
5378
  env->DeleteFile(lockname).PermitUncheckedError();
5170
5379
 
5380
+ // Make sure trash files are all cleared before return.
5381
+ if (sfm && bucket.has_value()) {
5382
+ sfm->WaitForEmptyTrashBucket(bucket.value());
5383
+ }
5171
5384
  // sst_file_manager holds a ref to the logger. Make sure the logger is
5172
5385
  // gone before trying to remove the directory.
5173
5386
  soptions.sst_file_manager.reset();
5174
5387
 
5175
5388
  // Ignore error in case dir contains other files
5176
- env->DeleteDir(dbname).PermitUncheckedError();
5177
- ;
5389
+ for (const auto& path_to_delete : paths_to_delete) {
5390
+ env->DeleteDir(path_to_delete).PermitUncheckedError();
5391
+ }
5178
5392
  }
5179
5393
  return result;
5180
5394
  }
@@ -5614,6 +5828,18 @@ Status DBImpl::IngestExternalFiles(
5614
5828
  "timestamps enabled doesn't support ingest behind.");
5615
5829
  }
5616
5830
  }
5831
+ if (ingest_opts.allow_db_generated_files) {
5832
+ if (ingest_opts.write_global_seqno) {
5833
+ return Status::NotSupported(
5834
+ "write_global_seqno is deprecated and does not work with "
5835
+ "allow_db_generated_files.");
5836
+ }
5837
+ if (ingest_opts.move_files) {
5838
+ return Status::NotSupported(
5839
+ "Options move_files and allow_db_generated_files are not "
5840
+ "compatible.");
5841
+ }
5842
+ }
5617
5843
  }
5618
5844
 
5619
5845
  // TODO (yanqin) maybe handle the case in which column_families have