@nxtedition/rocksdb 13.1.4 → 13.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/binding.cc +43 -16
  2. package/deps/rocksdb/rocksdb/{TARGETS → BUCK} +27 -0
  3. package/deps/rocksdb/rocksdb/CMakeLists.txt +3 -1
  4. package/deps/rocksdb/rocksdb/Makefile +2 -2
  5. package/deps/rocksdb/rocksdb/cache/cache.cc +3 -1
  6. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +2 -0
  7. package/deps/rocksdb/rocksdb/db/attribute_group_iterator_impl.h +34 -9
  8. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +7 -6
  9. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +5 -1
  10. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +22 -14
  11. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +149 -0
  12. package/deps/rocksdb/rocksdb/db/builder.cc +13 -24
  13. package/deps/rocksdb/rocksdb/db/coalescing_iterator.h +35 -10
  14. package/deps/rocksdb/rocksdb/db/column_family.cc +21 -10
  15. package/deps/rocksdb/rocksdb/db/column_family.h +15 -8
  16. package/deps/rocksdb/rocksdb/db/column_family_test.cc +98 -7
  17. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +126 -16
  18. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +51 -5
  19. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +2 -2
  20. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +2 -8
  21. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +24 -0
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +52 -22
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +9 -7
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +36 -9
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +6 -0
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +30 -17
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +26 -23
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +43 -33
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +6 -5
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +19 -9
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +6 -5
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +632 -411
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +171 -51
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +7 -5
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +37 -10
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +51 -11
  37. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +10 -3
  38. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +350 -154
  39. package/deps/rocksdb/rocksdb/db/convenience.cc +1 -1
  40. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +62 -27
  41. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +68 -1
  42. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +91 -0
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +134 -70
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +71 -23
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +43 -16
  46. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +47 -33
  47. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +27 -19
  48. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +38 -25
  49. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +3 -3
  50. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +7 -4
  51. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +258 -42
  52. package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +161 -9
  53. package/deps/rocksdb/rocksdb/db/db_iter.cc +118 -86
  54. package/deps/rocksdb/rocksdb/db/db_iter.h +44 -17
  55. package/deps/rocksdb/rocksdb/db/db_options_test.cc +27 -6
  56. package/deps/rocksdb/rocksdb/db/db_test.cc +48 -16
  57. package/deps/rocksdb/rocksdb/db/db_test2.cc +60 -15
  58. package/deps/rocksdb/rocksdb/db/db_test_util.cc +97 -44
  59. package/deps/rocksdb/rocksdb/db/db_test_util.h +7 -1
  60. package/deps/rocksdb/rocksdb/db/dbformat.cc +15 -5
  61. package/deps/rocksdb/rocksdb/db/dbformat.h +137 -55
  62. package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -0
  63. package/deps/rocksdb/rocksdb/db/experimental.cc +54 -0
  64. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +663 -8
  65. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +152 -91
  66. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +134 -11
  67. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +55 -9
  68. package/deps/rocksdb/rocksdb/db/flush_job.cc +52 -29
  69. package/deps/rocksdb/rocksdb/db/flush_job.h +5 -3
  70. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +18 -12
  71. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +23 -29
  72. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +3 -2
  73. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +2 -0
  74. package/deps/rocksdb/rocksdb/db/internal_stats.cc +9 -6
  75. package/deps/rocksdb/rocksdb/db/internal_stats.h +54 -0
  76. package/deps/rocksdb/rocksdb/db/job_context.h +1 -1
  77. package/deps/rocksdb/rocksdb/db/log_reader.cc +6 -7
  78. package/deps/rocksdb/rocksdb/db/manifest_ops.cc +47 -0
  79. package/deps/rocksdb/rocksdb/db/manifest_ops.h +20 -0
  80. package/deps/rocksdb/rocksdb/db/memtable.cc +165 -64
  81. package/deps/rocksdb/rocksdb/db/memtable.h +422 -243
  82. package/deps/rocksdb/rocksdb/db/memtable_list.cc +99 -68
  83. package/deps/rocksdb/rocksdb/db/memtable_list.h +63 -38
  84. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +28 -25
  85. package/deps/rocksdb/rocksdb/db/multi_cf_iterator_impl.h +118 -60
  86. package/deps/rocksdb/rocksdb/db/multi_cf_iterator_test.cc +344 -89
  87. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +2 -3
  88. package/deps/rocksdb/rocksdb/db/repair.cc +15 -14
  89. package/deps/rocksdb/rocksdb/db/repair_test.cc +0 -13
  90. package/deps/rocksdb/rocksdb/db/snapshot_checker.h +7 -0
  91. package/deps/rocksdb/rocksdb/db/table_cache.cc +62 -65
  92. package/deps/rocksdb/rocksdb/db/table_cache.h +70 -76
  93. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +5 -6
  94. package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +1 -1
  95. package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +8 -7
  96. package/deps/rocksdb/rocksdb/db/version_builder.cc +17 -19
  97. package/deps/rocksdb/rocksdb/db/version_builder.h +13 -12
  98. package/deps/rocksdb/rocksdb/db/version_edit.h +30 -0
  99. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +3 -5
  100. package/deps/rocksdb/rocksdb/db/version_set.cc +89 -129
  101. package/deps/rocksdb/rocksdb/db/version_set.h +12 -4
  102. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +1 -2
  103. package/deps/rocksdb/rocksdb/db/version_set_test.cc +12 -8
  104. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +0 -15
  105. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +0 -2
  106. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +9 -7
  107. package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.cc +0 -8
  108. package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.h +28 -2
  109. package/deps/rocksdb/rocksdb/db/write_batch.cc +32 -10
  110. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +9 -0
  111. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +2 -1
  112. package/deps/rocksdb/rocksdb/db/write_thread.cc +3 -1
  113. package/deps/rocksdb/rocksdb/db/write_thread.h +6 -2
  114. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +15 -0
  115. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +7 -0
  116. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +4 -0
  117. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +18 -2
  118. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +100 -22
  119. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +15 -4
  120. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +34 -8
  121. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +223 -78
  122. package/deps/rocksdb/rocksdb/env/file_system.cc +6 -1
  123. package/deps/rocksdb/rocksdb/env/fs_posix.cc +53 -0
  124. package/deps/rocksdb/rocksdb/env/io_posix.cc +63 -17
  125. package/deps/rocksdb/rocksdb/env/io_posix.h +30 -1
  126. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +132 -48
  127. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +92 -24
  128. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +727 -109
  129. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +3 -4
  130. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +1 -1
  131. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +8 -0
  132. package/deps/rocksdb/rocksdb/include/rocksdb/attribute_groups.h +20 -1
  133. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +9 -0
  134. package/deps/rocksdb/rocksdb/include/rocksdb/configurable.h +9 -5
  135. package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +2 -0
  136. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +10 -2
  137. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +1 -0
  138. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +7 -0
  139. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +34 -37
  140. package/deps/rocksdb/rocksdb/include/rocksdb/iterator_base.h +21 -0
  141. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +56 -28
  142. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +3 -0
  143. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +36 -28
  144. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +11 -0
  145. package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +1 -0
  146. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h +84 -60
  147. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/secondary_index.h +102 -0
  148. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +89 -2
  149. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +32 -0
  150. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +30 -1
  151. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +23 -2
  152. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  153. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +2 -0
  154. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +79 -21
  155. package/deps/rocksdb/rocksdb/memtable/skiplist.h +41 -18
  156. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +1 -5
  157. package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.cc +169 -0
  158. package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.h +400 -0
  159. package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +2 -0
  160. package/deps/rocksdb/rocksdb/options/cf_options.cc +137 -82
  161. package/deps/rocksdb/rocksdb/options/cf_options.h +18 -6
  162. package/deps/rocksdb/rocksdb/options/configurable.cc +31 -17
  163. package/deps/rocksdb/rocksdb/options/configurable_helper.h +7 -6
  164. package/deps/rocksdb/rocksdb/options/options_helper.cc +10 -8
  165. package/deps/rocksdb/rocksdb/options/options_parser.cc +74 -54
  166. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +89 -0
  167. package/deps/rocksdb/rocksdb/options/options_test.cc +112 -26
  168. package/deps/rocksdb/rocksdb/port/port.h +5 -9
  169. package/deps/rocksdb/rocksdb/src.mk +8 -0
  170. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h +4 -0
  171. package/deps/rocksdb/rocksdb/table/block_based/block.h +1 -7
  172. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +2 -0
  173. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +62 -80
  174. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +13 -3
  175. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +16 -5
  176. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +38 -7
  177. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +12 -4
  178. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +4 -1
  179. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +4 -1
  180. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +204 -1
  181. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +3 -3
  182. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +2 -1
  183. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h +4 -0
  184. package/deps/rocksdb/rocksdb/table/format.cc +3 -3
  185. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +4 -1
  186. package/deps/rocksdb/rocksdb/table/mock_table.cc +0 -50
  187. package/deps/rocksdb/rocksdb/table/mock_table.h +53 -0
  188. package/deps/rocksdb/rocksdb/table/plain/plain_table_factory.h +4 -0
  189. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -1
  190. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +10 -5
  191. package/deps/rocksdb/rocksdb/table/table_builder.h +3 -1
  192. package/deps/rocksdb/rocksdb/table/table_properties.cc +181 -0
  193. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +5 -5
  194. package/deps/rocksdb/rocksdb/table/table_test.cc +71 -64
  195. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim.py +45 -45
  196. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py +35 -35
  197. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py +43 -43
  198. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +41 -4
  199. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +1 -0
  200. package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +1 -1
  201. package/deps/rocksdb/rocksdb/unreleased_history/add.sh +13 -0
  202. package/deps/rocksdb/rocksdb/util/aligned_buffer.h +24 -5
  203. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +7 -0
  204. package/deps/rocksdb/rocksdb/util/file_checksum_helper.cc +0 -52
  205. package/deps/rocksdb/rocksdb/util/file_checksum_helper.h +1 -10
  206. package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +92 -0
  207. package/deps/rocksdb/rocksdb/util/thread_operation.h +1 -0
  208. package/deps/rocksdb/rocksdb/util/udt_util.cc +50 -4
  209. package/deps/rocksdb/rocksdb/util/udt_util.h +24 -11
  210. package/deps/rocksdb/rocksdb/util/udt_util_test.cc +26 -13
  211. package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +1 -16
  212. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +2 -0
  213. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.cc +214 -0
  214. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.h +60 -0
  215. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index_test.cc +124 -0
  216. package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_mixin.h +441 -0
  217. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.cc +34 -3
  218. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.h +7 -2
  219. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +437 -0
  220. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +34 -11
  221. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +14 -7
  222. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +7 -1
  223. package/deps/rocksdb/rocksdb/utilities/transactions/snapshot_checker.cc +17 -0
  224. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +69 -0
  225. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +20 -0
  226. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +1290 -0
  227. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +324 -0
  228. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +18 -1
  229. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +8 -1
  230. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +57 -12
  231. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +32 -3
  232. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +33 -2
  233. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +721 -9
  234. package/deps/rocksdb/rocksdb.gyp +2 -0
  235. package/package.json +1 -1
  236. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  237. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
@@ -48,6 +48,7 @@
48
48
  #include "db/write_controller.h"
49
49
  #include "db/write_thread.h"
50
50
  #include "logging/event_logger.h"
51
+ #include "memtable/wbwi_memtable.h"
51
52
  #include "monitoring/instrumented_mutex.h"
52
53
  #include "options/db_options.h"
53
54
  #include "port/port.h"
@@ -60,6 +61,7 @@
60
61
  #include "rocksdb/transaction_log.h"
61
62
  #include "rocksdb/user_write_callback.h"
62
63
  #include "rocksdb/utilities/replayer.h"
64
+ #include "rocksdb/utilities/write_batch_with_index.h"
63
65
  #include "rocksdb/write_buffer_manager.h"
64
66
  #include "table/merging_iterator.h"
65
67
  #include "util/autovector.h"
@@ -363,12 +365,10 @@ class DBImpl : public DB {
363
365
  const Snapshot* GetSnapshot() override;
364
366
  void ReleaseSnapshot(const Snapshot* snapshot) override;
365
367
 
366
- // EXPERIMENTAL
367
368
  std::unique_ptr<Iterator> NewCoalescingIterator(
368
369
  const ReadOptions& options,
369
370
  const std::vector<ColumnFamilyHandle*>& column_families) override;
370
371
 
371
- // EXPERIMENTAL
372
372
  std::unique_ptr<AttributeGroupIterator> NewAttributeGroupIterator(
373
373
  const ReadOptions& options,
374
374
  const std::vector<ColumnFamilyHandle*>& column_families) override;
@@ -482,7 +482,8 @@ class DBImpl : public DB {
482
482
 
483
483
  Status GetDbIdentity(std::string& identity) const override;
484
484
 
485
- virtual Status GetDbIdentityFromIdentityFile(std::string* identity) const;
485
+ virtual Status GetDbIdentityFromIdentityFile(const IOOptions& opts,
486
+ std::string* identity) const;
486
487
 
487
488
  Status GetDbSessionId(std::string& session_id) const override;
488
489
 
@@ -1199,9 +1200,7 @@ class DBImpl : public DB {
1199
1200
 
1200
1201
  uint64_t TEST_total_log_size() const { return total_log_size_; }
1201
1202
 
1202
- // Returns column family name to ImmutableCFOptions map.
1203
- Status TEST_GetAllImmutableCFOptions(
1204
- std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map);
1203
+ void TEST_GetAllBlockCaches(std::unordered_set<const Cache*>* cache_set);
1205
1204
 
1206
1205
  // Return the latest MutableCFOptions of a column family
1207
1206
  Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family,
@@ -1470,7 +1469,8 @@ class DBImpl : public DB {
1470
1469
  // The following two functions can only be called when:
1471
1470
  // 1. WriteThread::Writer::EnterUnbatched() is used.
1472
1471
  // 2. db_mutex is NOT held
1473
- Status RenameTempFileToOptionsFile(const std::string& file_name);
1472
+ Status RenameTempFileToOptionsFile(const std::string& file_name,
1473
+ bool is_remote_compaction_enabled);
1474
1474
  Status DeleteObsoleteOptionsFiles();
1475
1475
 
1476
1476
  void NotifyOnManualFlushScheduled(autovector<ColumnFamilyData*> cfds,
@@ -1509,6 +1509,23 @@ class DBImpl : public DB {
1509
1509
 
1510
1510
  void EraseThreadStatusDbInfo() const;
1511
1511
 
1512
+ // For CFs that have updates in `wbwi`, their memtable will be switched,
1513
+ // and `wbwi` will be added as the latest immutable memtable.
1514
+ //
1515
+ // REQUIRES: this thread is currently at the front of the main writer queue.
1516
+ // @param min_prep_log refers to the WAL that contains the prepare record
1517
+ // for the transaction based on wbwi.
1518
+ // @param assigned_seqno Sequence numbers for the ingested memtable.
1519
+ // @param last_seqno the value of versions_->LastSequence() after the write that
1520
+ // ingests `wbwi` is done.
1521
+ // @param memtable_updated Whether the same write that ingests wbwi has
1522
+ // updated memtable. This is useful for determining whether to set bg
1523
+ // error when IngestWBWI fails.
1524
+ Status IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
1525
+ const WBWIMemTable::SeqnoRange& assigned_seqno,
1526
+ uint64_t min_prep_log, SequenceNumber last_seqno,
1527
+ bool memtable_updated, bool ignore_missing_cf);
1528
+
1512
1529
  // If disable_memtable is set the application logic must guarantee that the
1513
1530
  // batch will still be skipped from memtable during the recovery. An exception
1514
1531
  // to this is seq_per_batch_ mode, in which since each batch already takes one
@@ -1524,6 +1541,16 @@ class DBImpl : public DB {
1524
1541
  // batch_cnt is expected to be non-zero in seq_per_batch mode and
1525
1542
  // indicates the number of sub-patches. A sub-patch is a subset of the write
1526
1543
  // batch that does not have duplicate keys.
1544
+ // `callback` is called before WAL write.
1545
+ // See more in comment above WriteCallback::Callback().
1546
+ // pre_release_callback is called after WAL write and before memtable write.
1547
+ // See more in comment above PreReleaseCallback::Callback().
1548
+ // post_memtable_callback is called after memtable write but before publishing
1549
+ // the sequence number to readers.
1550
+ //
1551
+ // The main write queue. This is the only write queue that updates
1552
+ // LastSequence. When using one write queue, the same sequence also indicates
1553
+ // the last published sequence.
1527
1554
  Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
1528
1555
  WriteCallback* callback = nullptr,
1529
1556
  UserWriteCallback* user_write_cb = nullptr,
@@ -1531,7 +1558,9 @@ class DBImpl : public DB {
1531
1558
  bool disable_memtable = false, uint64_t* seq_used = nullptr,
1532
1559
  size_t batch_cnt = 0,
1533
1560
  PreReleaseCallback* pre_release_callback = nullptr,
1534
- PostMemTableCallback* post_memtable_callback = nullptr);
1561
+ PostMemTableCallback* post_memtable_callback = nullptr,
1562
+ std::shared_ptr<WriteBatchWithIndex> wbwi = nullptr,
1563
+ uint64_t min_prep_log = 0);
1535
1564
 
1536
1565
  Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates,
1537
1566
  WriteCallback* callback = nullptr,
@@ -1594,7 +1623,7 @@ class DBImpl : public DB {
1594
1623
  // Read/create DB identity file (as appropriate), and write DB ID to
1595
1624
  // version_edit if provided.
1596
1625
  Status SetupDBId(const WriteOptions& write_options, bool read_only,
1597
- bool is_new_db, VersionEdit* version_edit);
1626
+ bool is_new_db, bool is_retry, VersionEdit* version_edit);
1598
1627
  // Assign db_id_ and write DB ID to version_edit if provided.
1599
1628
  void SetDBId(std::string&& id, bool read_only, VersionEdit* version_edit);
1600
1629
 
@@ -1711,7 +1740,7 @@ class DBImpl : public DB {
1711
1740
 
1712
1741
  struct WriteContext {
1713
1742
  SuperVersionContext superversion_context;
1714
- autovector<MemTable*> memtables_to_free_;
1743
+ autovector<ReadOnlyMemTable*> memtables_to_free_;
1715
1744
 
1716
1745
  explicit WriteContext(bool create_superversion = false)
1717
1746
  : superversion_context(create_superversion) {}
@@ -2053,7 +2082,21 @@ class DBImpl : public DB {
2053
2082
 
2054
2083
  Status TrimMemtableHistory(WriteContext* context);
2055
2084
 
2056
- Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context);
2085
+ // Switches the current live memtable to immutable/read-only memtable.
2086
+ // A new WAL is created if the current WAL is not empty.
2087
+ // If `new_imm` is not nullptr, it will be added as the newest immutable
2088
+ // memtable, if and only if OK status is returned.
2089
+ // `last_seqno` needs to be provided if `new_imm` is not nullptr. It is
2090
+ // the value of versions_->LastSequence() after the write that ingests new_imm
2091
+ // is done.
2092
+ //
2093
+ // REQUIRES: mutex_ is held
2094
+ // REQUIRES: this thread is currently at the front of the writer queue
2095
+ // REQUIRES: this thread is currently at the front of the 2nd writer queue if
2096
+ // two_write_queues_ is true (This is to simplify the reasoning.)
2097
+ Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
2098
+ ReadOnlyMemTable* new_imm = nullptr,
2099
+ SequenceNumber last_seqno = 0);
2057
2100
 
2058
2101
  // Select and output column families qualified for atomic flush in
2059
2102
  // `selected_cfds`. If `provided_candidate_cfds` is non-empty, it will be used
@@ -2091,17 +2134,18 @@ class DBImpl : public DB {
2091
2134
  // memtable pending flush.
2092
2135
  // resuming_from_bg_err indicates whether the caller is attempting to resume
2093
2136
  // from background error.
2094
- Status WaitForFlushMemTable(ColumnFamilyData* cfd,
2095
- const uint64_t* flush_memtable_id = nullptr,
2096
- bool resuming_from_bg_err = false) {
2137
+ Status WaitForFlushMemTable(
2138
+ ColumnFamilyData* cfd, const uint64_t* flush_memtable_id = nullptr,
2139
+ bool resuming_from_bg_err = false,
2140
+ std::optional<FlushReason> flush_reason = std::nullopt) {
2097
2141
  return WaitForFlushMemTables({cfd}, {flush_memtable_id},
2098
- resuming_from_bg_err);
2142
+ resuming_from_bg_err, flush_reason);
2099
2143
  }
2100
2144
  // Wait for memtables to be flushed for multiple column families.
2101
2145
  Status WaitForFlushMemTables(
2102
2146
  const autovector<ColumnFamilyData*>& cfds,
2103
2147
  const autovector<const uint64_t*>& flush_memtable_ids,
2104
- bool resuming_from_bg_err);
2148
+ bool resuming_from_bg_err, std::optional<FlushReason> flush_reason);
2105
2149
 
2106
2150
  inline void WaitForPendingWrites() {
2107
2151
  mutex_.AssertHeld();
@@ -2216,8 +2260,6 @@ class DBImpl : public DB {
2216
2260
  void TrackOrUntrackFiles(const std::vector<std::string>& existing_data_files,
2217
2261
  bool track);
2218
2262
 
2219
- ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name);
2220
-
2221
2263
  void MaybeScheduleFlushOrCompaction();
2222
2264
 
2223
2265
  struct FlushRequest {
@@ -2897,6 +2939,11 @@ class DBImpl : public DB {
2897
2939
  // garbages, among all column families.
2898
2940
  SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
2899
2941
 
2942
+ // The min threshold to trigger compactions for standalone range deletion
2943
+ // files that are marked for compaction.
2944
+ SequenceNumber standalone_range_deletion_files_mark_threshold_ =
2945
+ kMaxSequenceNumber;
2946
+
2900
2947
  LogsWithPrepTracker logs_with_prep_tracker_;
2901
2948
 
2902
2949
  // Callback for compaction to check if a key is visible to a snapshot.
@@ -3003,7 +3050,8 @@ CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions,
3003
3050
  VersionEdit GetDBRecoveryEditForObsoletingMemTables(
3004
3051
  VersionSet* vset, const ColumnFamilyData& cfd,
3005
3052
  const autovector<VersionEdit*>& edit_list,
3006
- const autovector<MemTable*>& memtables, LogsWithPrepTracker* prep_tracker);
3053
+ const autovector<ReadOnlyMemTable*>& memtables,
3054
+ LogsWithPrepTracker* prep_tracker);
3007
3055
 
3008
3056
  // Return the earliest log file to keep after the memtable flush is
3009
3057
  // finalized.
@@ -3014,13 +3062,13 @@ VersionEdit GetDBRecoveryEditForObsoletingMemTables(
3014
3062
  uint64_t PrecomputeMinLogNumberToKeep2PC(
3015
3063
  VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
3016
3064
  const autovector<VersionEdit*>& edit_list,
3017
- const autovector<MemTable*>& memtables_to_flush,
3065
+ const autovector<ReadOnlyMemTable*>& memtables_to_flush,
3018
3066
  LogsWithPrepTracker* prep_tracker);
3019
3067
  // For atomic flush.
3020
3068
  uint64_t PrecomputeMinLogNumberToKeep2PC(
3021
3069
  VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
3022
3070
  const autovector<autovector<VersionEdit*>>& edit_lists,
3023
- const autovector<const autovector<MemTable*>*>& memtables_to_flush,
3071
+ const autovector<const autovector<ReadOnlyMemTable*>*>& memtables_to_flush,
3024
3072
  LogsWithPrepTracker* prep_tracker);
3025
3073
 
3026
3074
  // In non-2PC mode, WALs with log number < the returned number can be
@@ -3037,11 +3085,11 @@ uint64_t PrecomputeMinLogNumberToKeepNon2PC(
3037
3085
  // will not depend on any WAL file. nullptr means no memtable is being flushed.
3038
3086
  // The function is only applicable to 2pc mode.
3039
3087
  uint64_t FindMinPrepLogReferencedByMemTable(
3040
- VersionSet* vset, const autovector<MemTable*>& memtables_to_flush);
3088
+ VersionSet* vset, const autovector<ReadOnlyMemTable*>& memtables_to_flush);
3041
3089
  // For atomic flush.
3042
3090
  uint64_t FindMinPrepLogReferencedByMemTable(
3043
3091
  VersionSet* vset,
3044
- const autovector<const autovector<MemTable*>*>& memtables_to_flush);
3092
+ const autovector<const autovector<ReadOnlyMemTable*>*>& memtables_to_flush);
3045
3093
 
3046
3094
  // Fix user-supplied options to be reasonable
3047
3095
  template <class T, class V>
@@ -753,7 +753,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
753
753
 
754
754
  if (s.ok()) {
755
755
  autovector<ColumnFamilyData*> tmp_cfds;
756
- autovector<const autovector<MemTable*>*> mems_list;
756
+ autovector<const autovector<ReadOnlyMemTable*>*> mems_list;
757
757
  autovector<const MutableCFOptions*> mutable_cf_options_list;
758
758
  autovector<FileMetaData*> tmp_file_meta;
759
759
  autovector<std::list<std::unique_ptr<FlushJobInfo>>*>
@@ -1457,11 +1457,6 @@ Status DBImpl::CompactFilesImpl(
1457
1457
  input_set.insert(TableFileNameToNumber(file_name));
1458
1458
  }
1459
1459
 
1460
- ColumnFamilyMetaData cf_meta;
1461
- // TODO(yhchiang): can directly use version here if none of the
1462
- // following functions call is pluggable to external developers.
1463
- version->GetColumnFamilyMetaData(&cf_meta);
1464
-
1465
1460
  if (output_path_id < 0) {
1466
1461
  if (cfd->ioptions()->cf_paths.size() == 1U) {
1467
1462
  output_path_id = 0;
@@ -1482,7 +1477,7 @@ Status DBImpl::CompactFilesImpl(
1482
1477
 
1483
1478
  std::vector<CompactionInputFiles> input_files;
1484
1479
  Status s = cfd->compaction_picker()->SanitizeAndConvertCompactionInputFiles(
1485
- &input_set, cf_meta, output_level, version->storage_info(), &input_files);
1480
+ &input_set, output_level, version, &input_files);
1486
1481
  TEST_SYNC_POINT(
1487
1482
  "DBImpl::CompactFilesImpl::PostSanitizeAndConvertCompactionInputFiles");
1488
1483
  if (!s.ok()) {
@@ -1862,8 +1857,9 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
1862
1857
  mutable_cf_options.compression_opts,
1863
1858
  mutable_cf_options.default_write_temperature,
1864
1859
  0 /* max_subcompactions, not applicable */,
1865
- {} /* grandparents, not applicable */, false /* is manual */,
1866
- "" /* trim_ts */, -1 /* score, not applicable */,
1860
+ {} /* grandparents, not applicable */,
1861
+ std::nullopt /* earliest_snapshot */, nullptr /* snapshot_checker */,
1862
+ false /* is manual */, "" /* trim_ts */, -1 /* score, not applicable */,
1867
1863
  false /* is deletion compaction, not applicable */,
1868
1864
  false /* l0_files_might_overlap, not applicable */,
1869
1865
  CompactionReason::kRefitLevel));
@@ -2417,7 +2413,8 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
2417
2413
  }
2418
2414
  s = WaitForFlushMemTables(
2419
2415
  cfds, flush_memtable_ids,
2420
- flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */);
2416
+ flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */,
2417
+ flush_reason);
2421
2418
  InstrumentedMutexLock lock_guard(&mutex_);
2422
2419
  for (auto* tmp_cfd : cfds) {
2423
2420
  tmp_cfd->UnrefAndTryDelete();
@@ -2559,7 +2556,8 @@ Status DBImpl::AtomicFlushMemTables(
2559
2556
  }
2560
2557
  s = WaitForFlushMemTables(
2561
2558
  cfds, flush_memtable_ids,
2562
- flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */);
2559
+ flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */,
2560
+ flush_reason);
2563
2561
  InstrumentedMutexLock lock_guard(&mutex_);
2564
2562
  for (auto* cfd : cfds) {
2565
2563
  cfd->UnrefAndTryDelete();
@@ -2622,7 +2620,7 @@ Status DBImpl::RetryFlushesForErrorRecovery(FlushReason flush_reason,
2622
2620
  flush_memtable_id_ptrs.push_back(&flush_memtable_id);
2623
2621
  }
2624
2622
  s = WaitForFlushMemTables(cfds, flush_memtable_id_ptrs,
2625
- true /* resuming_from_bg_err */);
2623
+ true /* resuming_from_bg_err */, flush_reason);
2626
2624
  mutex_.Lock();
2627
2625
  }
2628
2626
 
@@ -2722,7 +2720,7 @@ Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
2722
2720
  Status DBImpl::WaitForFlushMemTables(
2723
2721
  const autovector<ColumnFamilyData*>& cfds,
2724
2722
  const autovector<const uint64_t*>& flush_memtable_ids,
2725
- bool resuming_from_bg_err) {
2723
+ bool resuming_from_bg_err, std::optional<FlushReason> flush_reason) {
2726
2724
  int num = static_cast<int>(cfds.size());
2727
2725
  // Wait until the compaction completes
2728
2726
  InstrumentedMutexLock l(&mutex_);
@@ -2760,7 +2758,15 @@ Status DBImpl::WaitForFlushMemTables(
2760
2758
  (flush_memtable_ids[i] != nullptr &&
2761
2759
  cfds[i]->imm()->GetEarliestMemTableID() >
2762
2760
  *flush_memtable_ids[i])) {
2763
- ++num_finished;
2761
+ // Make file ingestion's flush wait until SuperVersion is also updated
2762
+ // since after flush, it does range overlapping check and file level
2763
+ // assignment with the current SuperVersion.
2764
+ if (!flush_reason.has_value() ||
2765
+ flush_reason.value() != FlushReason::kExternalFileIngestion ||
2766
+ cfds[i]->GetSuperVersion()->imm->GetID() ==
2767
+ cfds[i]->imm()->current()->GetID()) {
2768
+ ++num_finished;
2769
+ }
2764
2770
  }
2765
2771
  }
2766
2772
  if (1 == num_dropped && 1 == num) {
@@ -3679,8 +3685,20 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
3679
3685
  // compaction is not necessary. Need to make sure mutex is held
3680
3686
  // until we make a copy in the following code
3681
3687
  TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction");
3688
+ SnapshotChecker* snapshot_checker = nullptr;
3689
+ std::vector<SequenceNumber> snapshot_seqs;
3690
+ // This info is not useful for other scenarios, so save querying existing
3691
+ // snapshots for those cases.
3692
+ if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal &&
3693
+ cfd->user_comparator()->timestamp_size() == 0) {
3694
+ SequenceNumber earliest_write_conflict_snapshot;
3695
+ GetSnapshotContext(job_context, &snapshot_seqs,
3696
+ &earliest_write_conflict_snapshot,
3697
+ &snapshot_checker);
3698
+ assert(is_snapshot_supported_ || snapshots_.empty());
3699
+ }
3682
3700
  c.reset(cfd->PickCompaction(*mutable_cf_options, mutable_db_options_,
3683
- log_buffer));
3701
+ snapshot_seqs, snapshot_checker, log_buffer));
3684
3702
  TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction");
3685
3703
 
3686
3704
  if (c != nullptr) {
@@ -3968,7 +3986,10 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
3968
3986
  // Sanity checking that compaction files are freed.
3969
3987
  for (size_t i = 0; i < c->num_input_levels(); i++) {
3970
3988
  for (size_t j = 0; j < c->inputs(i)->size(); j++) {
3971
- assert(!c->input(i, j)->being_compacted);
3989
+ // When status is not OK, compaction's result installation failed and
3990
+ // no new Version installed. The files could have been released and
3991
+ // picked up again by other compaction attempts.
3992
+ assert(!c->input(i, j)->being_compacted || !status.ok());
3972
3993
  }
3973
3994
  }
3974
3995
  std::unordered_set<Compaction*>* cip = c->column_family_data()
@@ -4287,12 +4308,18 @@ void DBImpl::InstallSuperVersionAndScheduleWork(
4287
4308
  // newer snapshot created and released frequently, the compaction will be
4288
4309
  // triggered soon anyway.
4289
4310
  bottommost_files_mark_threshold_ = kMaxSequenceNumber;
4311
+ standalone_range_deletion_files_mark_threshold_ = kMaxSequenceNumber;
4290
4312
  for (auto* my_cfd : *versions_->GetColumnFamilySet()) {
4291
4313
  if (!my_cfd->ioptions()->allow_ingest_behind) {
4292
4314
  bottommost_files_mark_threshold_ = std::min(
4293
4315
  bottommost_files_mark_threshold_,
4294
4316
  my_cfd->current()->storage_info()->bottommost_files_mark_threshold());
4295
4317
  }
4318
+ standalone_range_deletion_files_mark_threshold_ =
4319
+ std::min(standalone_range_deletion_files_mark_threshold_,
4320
+ cfd->current()
4321
+ ->storage_info()
4322
+ ->standalone_range_tombstone_files_mark_threshold());
4296
4323
  }
4297
4324
 
4298
4325
  // Whenever we install new SuperVersion, we might need to issue new flushes or
@@ -6,8 +6,8 @@
6
6
  // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7
7
  // Use of this source code is governed by a BSD-style license that can be
8
8
  // found in the LICENSE file. See the AUTHORS file for names of contributors.
9
-
10
9
  #ifndef NDEBUG
10
+ #include <iostream>
11
11
 
12
12
  #include "db/blob/blob_file_cache.h"
13
13
  #include "db/column_family.h"
@@ -233,23 +233,16 @@ uint64_t DBImpl::TEST_LogfileNumber() {
233
233
  return logfile_number_;
234
234
  }
235
235
 
236
- Status DBImpl::TEST_GetAllImmutableCFOptions(
237
- std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map) {
238
- std::vector<std::string> cf_names;
239
- std::vector<const ImmutableCFOptions*> iopts;
240
- {
241
- InstrumentedMutexLock l(&mutex_);
242
- for (auto cfd : *versions_->GetColumnFamilySet()) {
243
- cf_names.push_back(cfd->GetName());
244
- iopts.push_back(cfd->ioptions());
236
+ void DBImpl::TEST_GetAllBlockCaches(
237
+ std::unordered_set<const Cache*>* cache_set) {
238
+ InstrumentedMutexLock l(&mutex_);
239
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
240
+ if (const auto bbto =
241
+ cfd->GetCurrentMutableCFOptions()
242
+ ->table_factory->GetOptions<BlockBasedTableOptions>()) {
243
+ cache_set->insert(bbto->block_cache.get());
245
244
  }
246
245
  }
247
- iopts_map->clear();
248
- for (size_t i = 0; i < cf_names.size(); ++i) {
249
- iopts_map->insert({cf_names[i], iopts[i]});
250
- }
251
-
252
- return Status::OK();
253
246
  }
254
247
 
255
248
  uint64_t DBImpl::TEST_FindMinLogContainingOutstandingPrep() {
@@ -265,7 +258,7 @@ size_t DBImpl::TEST_LogsWithPrepSize() {
265
258
  }
266
259
 
267
260
  uint64_t DBImpl::TEST_FindMinPrepLogReferencedByMemTable() {
268
- autovector<MemTable*> empty_list;
261
+ autovector<ReadOnlyMemTable*> empty_list;
269
262
  return FindMinPrepLogReferencedByMemTable(versions_.get(), empty_list);
270
263
  }
271
264
 
@@ -345,31 +338,52 @@ void DBImpl::TEST_VerifyNoObsoleteFilesCached(
345
338
  l.emplace(&mutex_);
346
339
  }
347
340
 
348
- std::vector<uint64_t> live_files;
341
+ if (!opened_successfully_) {
342
+ // We don't need to pro-actively clean up open files during DB::Open()
343
+ // if we know we are about to fail and clean up in Close().
344
+ return;
345
+ }
346
+ if (disable_delete_obsolete_files_ > 0) {
347
+ // For better or worse, DB::Close() is allowed with deletions disabled.
348
+ // Since we generally associate clean-up of open files with deleting them,
349
+ // we allow "obsolete" open files when deletions are disabled.
350
+ return;
351
+ }
352
+
353
+ // Live and "quarantined" files are allowed to be open in table cache
354
+ std::set<uint64_t> live_and_quar_files;
349
355
  for (auto cfd : *versions_->GetColumnFamilySet()) {
350
356
  if (cfd->IsDropped()) {
351
357
  continue;
352
358
  }
353
- // Sneakily add both SST and blob files to the same list
354
- cfd->current()->AddLiveFiles(&live_files, &live_files);
359
+ // Iterate over live versions
360
+ Version* current = cfd->current();
361
+ Version* ver = current;
362
+ do {
363
+ // Sneakily add both SST and blob files to the same list
364
+ std::vector<uint64_t> live_files_vec;
365
+ ver->AddLiveFiles(&live_files_vec, &live_files_vec);
366
+ live_and_quar_files.insert(live_files_vec.begin(), live_files_vec.end());
367
+
368
+ ver = ver->Next();
369
+ } while (ver != current);
355
370
  }
356
- std::sort(live_files.begin(), live_files.end());
357
-
358
- auto fn = [&live_files](const Slice& key, Cache::ObjectPtr, size_t,
359
- const Cache::CacheItemHelper* helper) {
360
- if (helper != BlobFileCache::GetHelper()) {
361
- // Skip non-blob files for now
362
- // FIXME: diagnose and fix the leaks of obsolete SST files revealed in
363
- // unit tests.
364
- return;
365
- }
371
+ {
372
+ const auto& quar_files = error_handler_.GetFilesToQuarantine();
373
+ live_and_quar_files.insert(quar_files.begin(), quar_files.end());
374
+ }
375
+ auto fn = [&live_and_quar_files](const Slice& key, Cache::ObjectPtr, size_t,
376
+ const Cache::CacheItemHelper*) {
366
377
  // See TableCache and BlobFileCache
367
378
  assert(key.size() == sizeof(uint64_t));
368
379
  uint64_t file_number;
369
380
  GetUnaligned(reinterpret_cast<const uint64_t*>(key.data()), &file_number);
370
- // Assert file is in sorted live_files
371
- assert(
372
- std::binary_search(live_files.begin(), live_files.end(), file_number));
381
+ // Assert file is in live/quarantined set
382
+ if (live_and_quar_files.find(file_number) == live_and_quar_files.end()) {
383
+ std::cerr << "File " << file_number << " is not live nor quarantined"
384
+ << std::endl;
385
+ assert(false);
386
+ }
373
387
  };
374
388
  table_cache_->ApplyToAllEntries(fn, {});
375
389
  }
@@ -449,14 +449,8 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
449
449
  // File is being deleted (actually obsolete)
450
450
  auto number = file.metadata->fd.GetNumber();
451
451
  candidate_files.emplace_back(MakeTableFileName(number), file.path);
452
- if (handle == nullptr) {
453
- // For files not "pinned" in table cache
454
- handle = TableCache::Lookup(table_cache_.get(), number);
455
- }
456
- if (handle) {
457
- TableCache::ReleaseObsolete(table_cache_.get(), handle,
458
- file.uncache_aggressiveness);
459
- }
452
+ TableCache::ReleaseObsolete(table_cache_.get(), number, handle,
453
+ file.uncache_aggressiveness);
460
454
  }
461
455
  file.DeleteMetadata();
462
456
  }
@@ -572,9 +566,17 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
572
566
  case kTableFile:
573
567
  // If the second condition is not there, this makes
574
568
  // DontDeletePendingOutputs fail
569
+ // FIXME: but should NOT keep if it came from sst_delete_files?
575
570
  keep = (sst_live_set.find(number) != sst_live_set.end()) ||
576
571
  number >= state.min_pending_output;
577
572
  if (!keep) {
573
+ // NOTE: sometimes redundant (if came from sst_delete_files)
574
+ // We don't know which column family is applicable here so we don't
575
+ // know what uncache_aggressiveness would be used with
576
+ // ReleaseObsolete(). Anyway, obsolete files ideally go into
577
+ // sst_delete_files for better/quicker handling, and this is just a
578
+ // backstop.
579
+ TableCache::Evict(table_cache_.get(), number);
578
580
  files_to_del.insert(number);
579
581
  }
580
582
  break;
@@ -739,7 +741,8 @@ void DBImpl::DeleteObsoleteFiles() {
739
741
  VersionEdit GetDBRecoveryEditForObsoletingMemTables(
740
742
  VersionSet* vset, const ColumnFamilyData& cfd,
741
743
  const autovector<VersionEdit*>& edit_list,
742
- const autovector<MemTable*>& memtables, LogsWithPrepTracker* prep_tracker) {
744
+ const autovector<ReadOnlyMemTable*>& memtables,
745
+ LogsWithPrepTracker* prep_tracker) {
743
746
  VersionEdit wal_deletion_edit;
744
747
  uint64_t min_wal_number_to_keep = 0;
745
748
  assert(edit_list.size() > 0);
@@ -769,12 +772,12 @@ VersionEdit GetDBRecoveryEditForObsoletingMemTables(
769
772
  }
770
773
 
771
774
  uint64_t FindMinPrepLogReferencedByMemTable(
772
- VersionSet* vset, const autovector<MemTable*>& memtables_to_flush) {
775
+ VersionSet* vset, const autovector<ReadOnlyMemTable*>& memtables_to_flush) {
773
776
  uint64_t min_log = 0;
774
777
 
775
778
  // we must look through the memtables for two phase transactions
776
779
  // that have been committed but not yet flushed
777
- std::unordered_set<MemTable*> memtables_to_flush_set(
780
+ std::unordered_set<ReadOnlyMemTable*> memtables_to_flush_set(
778
781
  memtables_to_flush.begin(), memtables_to_flush.end());
779
782
  for (auto loop_cfd : *vset->GetColumnFamilySet()) {
780
783
  if (loop_cfd->IsDropped()) {
@@ -799,12 +802,12 @@ uint64_t FindMinPrepLogReferencedByMemTable(
799
802
  }
800
803
 
801
804
  uint64_t FindMinPrepLogReferencedByMemTable(
802
- VersionSet* vset,
803
- const autovector<const autovector<MemTable*>*>& memtables_to_flush) {
805
+ VersionSet* vset, const autovector<const autovector<ReadOnlyMemTable*>*>&
806
+ memtables_to_flush) {
804
807
  uint64_t min_log = 0;
805
808
 
806
- std::unordered_set<MemTable*> memtables_to_flush_set;
807
- for (const autovector<MemTable*>* memtables : memtables_to_flush) {
809
+ std::unordered_set<ReadOnlyMemTable*> memtables_to_flush_set;
810
+ for (const autovector<ReadOnlyMemTable*>* memtables : memtables_to_flush) {
808
811
  memtables_to_flush_set.insert(memtables->begin(), memtables->end());
809
812
  }
810
813
  for (auto loop_cfd : *vset->GetColumnFamilySet()) {
@@ -896,7 +899,7 @@ uint64_t PrecomputeMinLogNumberToKeepNon2PC(
896
899
  uint64_t PrecomputeMinLogNumberToKeep2PC(
897
900
  VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
898
901
  const autovector<VersionEdit*>& edit_list,
899
- const autovector<MemTable*>& memtables_to_flush,
902
+ const autovector<ReadOnlyMemTable*>& memtables_to_flush,
900
903
  LogsWithPrepTracker* prep_tracker) {
901
904
  assert(vset != nullptr);
902
905
  assert(prep_tracker != nullptr);
@@ -937,7 +940,7 @@ uint64_t PrecomputeMinLogNumberToKeep2PC(
937
940
  uint64_t PrecomputeMinLogNumberToKeep2PC(
938
941
  VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
939
942
  const autovector<autovector<VersionEdit*>>& edit_lists,
940
- const autovector<const autovector<MemTable*>*>& memtables_to_flush,
943
+ const autovector<const autovector<ReadOnlyMemTable*>*>& memtables_to_flush,
941
944
  LogsWithPrepTracker* prep_tracker) {
942
945
  assert(vset != nullptr);
943
946
  assert(prep_tracker != nullptr);
@@ -980,7 +983,8 @@ void DBImpl::SetDBId(std::string&& id, bool read_only,
980
983
  }
981
984
 
982
985
  Status DBImpl::SetupDBId(const WriteOptions& write_options, bool read_only,
983
- bool is_new_db, VersionEdit* version_edit) {
986
+ bool is_new_db, bool is_retry,
987
+ VersionEdit* version_edit) {
984
988
  Status s;
985
989
  if (!is_new_db) {
986
990
  // Check for the IDENTITY file and create it if not there or
@@ -988,7 +992,11 @@ Status DBImpl::SetupDBId(const WriteOptions& write_options, bool read_only,
988
992
  std::string db_id_in_file;
989
993
  s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr);
990
994
  if (s.ok()) {
991
- s = GetDbIdentityFromIdentityFile(&db_id_in_file);
995
+ IOOptions opts;
996
+ if (is_retry) {
997
+ opts.verify_and_reconstruct_read = true;
998
+ }
999
+ s = GetDbIdentityFromIdentityFile(opts, &db_id_in_file);
992
1000
  if (s.ok() && !db_id_in_file.empty()) {
993
1001
  if (db_id_.empty()) {
994
1002
  // Loaded from file and wasn't already known from manifest