@nxtedition/rocksdb 13.1.5 → 13.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. package/binding.cc +37 -12
  2. package/deps/rocksdb/rocksdb/{TARGETS → BUCK} +27 -0
  3. package/deps/rocksdb/rocksdb/CMakeLists.txt +3 -1
  4. package/deps/rocksdb/rocksdb/Makefile +2 -2
  5. package/deps/rocksdb/rocksdb/cache/cache.cc +3 -1
  6. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +2 -0
  7. package/deps/rocksdb/rocksdb/db/attribute_group_iterator_impl.h +34 -9
  8. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +7 -6
  9. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +5 -1
  10. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +22 -14
  11. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +149 -0
  12. package/deps/rocksdb/rocksdb/db/builder.cc +13 -24
  13. package/deps/rocksdb/rocksdb/db/coalescing_iterator.h +35 -10
  14. package/deps/rocksdb/rocksdb/db/column_family.cc +21 -10
  15. package/deps/rocksdb/rocksdb/db/column_family.h +15 -8
  16. package/deps/rocksdb/rocksdb/db/column_family_test.cc +98 -7
  17. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +126 -16
  18. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +51 -5
  19. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +2 -2
  20. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +2 -8
  21. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +24 -0
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +52 -22
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +9 -7
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +36 -9
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +6 -0
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +30 -17
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +26 -23
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +43 -33
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +6 -5
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +19 -9
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +6 -5
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +632 -411
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +171 -51
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +7 -5
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +37 -10
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +51 -11
  37. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +10 -3
  38. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +350 -154
  39. package/deps/rocksdb/rocksdb/db/convenience.cc +1 -1
  40. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +62 -27
  41. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +68 -1
  42. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +91 -0
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +134 -70
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +71 -23
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +43 -16
  46. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +47 -33
  47. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +27 -19
  48. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +38 -25
  49. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +3 -3
  50. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +7 -4
  51. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +258 -42
  52. package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +161 -9
  53. package/deps/rocksdb/rocksdb/db/db_iter.cc +118 -86
  54. package/deps/rocksdb/rocksdb/db/db_iter.h +44 -17
  55. package/deps/rocksdb/rocksdb/db/db_options_test.cc +27 -6
  56. package/deps/rocksdb/rocksdb/db/db_test.cc +48 -16
  57. package/deps/rocksdb/rocksdb/db/db_test2.cc +60 -15
  58. package/deps/rocksdb/rocksdb/db/db_test_util.cc +97 -44
  59. package/deps/rocksdb/rocksdb/db/db_test_util.h +7 -1
  60. package/deps/rocksdb/rocksdb/db/dbformat.cc +15 -5
  61. package/deps/rocksdb/rocksdb/db/dbformat.h +137 -55
  62. package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -0
  63. package/deps/rocksdb/rocksdb/db/experimental.cc +54 -0
  64. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +663 -8
  65. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +152 -91
  66. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +134 -11
  67. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +55 -9
  68. package/deps/rocksdb/rocksdb/db/flush_job.cc +52 -29
  69. package/deps/rocksdb/rocksdb/db/flush_job.h +5 -3
  70. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +18 -12
  71. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +23 -29
  72. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +3 -2
  73. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +2 -0
  74. package/deps/rocksdb/rocksdb/db/internal_stats.cc +9 -6
  75. package/deps/rocksdb/rocksdb/db/internal_stats.h +54 -0
  76. package/deps/rocksdb/rocksdb/db/job_context.h +1 -1
  77. package/deps/rocksdb/rocksdb/db/log_reader.cc +6 -7
  78. package/deps/rocksdb/rocksdb/db/manifest_ops.cc +47 -0
  79. package/deps/rocksdb/rocksdb/db/manifest_ops.h +20 -0
  80. package/deps/rocksdb/rocksdb/db/memtable.cc +165 -64
  81. package/deps/rocksdb/rocksdb/db/memtable.h +422 -243
  82. package/deps/rocksdb/rocksdb/db/memtable_list.cc +99 -68
  83. package/deps/rocksdb/rocksdb/db/memtable_list.h +63 -38
  84. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +28 -25
  85. package/deps/rocksdb/rocksdb/db/multi_cf_iterator_impl.h +118 -60
  86. package/deps/rocksdb/rocksdb/db/multi_cf_iterator_test.cc +344 -89
  87. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +2 -3
  88. package/deps/rocksdb/rocksdb/db/repair.cc +15 -14
  89. package/deps/rocksdb/rocksdb/db/repair_test.cc +0 -13
  90. package/deps/rocksdb/rocksdb/db/snapshot_checker.h +7 -0
  91. package/deps/rocksdb/rocksdb/db/table_cache.cc +62 -65
  92. package/deps/rocksdb/rocksdb/db/table_cache.h +70 -76
  93. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +5 -6
  94. package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +1 -1
  95. package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +8 -7
  96. package/deps/rocksdb/rocksdb/db/version_builder.cc +17 -19
  97. package/deps/rocksdb/rocksdb/db/version_builder.h +13 -12
  98. package/deps/rocksdb/rocksdb/db/version_edit.h +30 -0
  99. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +3 -5
  100. package/deps/rocksdb/rocksdb/db/version_set.cc +89 -129
  101. package/deps/rocksdb/rocksdb/db/version_set.h +12 -4
  102. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +1 -2
  103. package/deps/rocksdb/rocksdb/db/version_set_test.cc +12 -8
  104. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +0 -15
  105. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +0 -2
  106. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +9 -7
  107. package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.cc +0 -8
  108. package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.h +28 -2
  109. package/deps/rocksdb/rocksdb/db/write_batch.cc +32 -10
  110. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +9 -0
  111. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +2 -1
  112. package/deps/rocksdb/rocksdb/db/write_thread.cc +3 -1
  113. package/deps/rocksdb/rocksdb/db/write_thread.h +6 -2
  114. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +15 -0
  115. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +7 -0
  116. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +4 -0
  117. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +18 -2
  118. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +100 -22
  119. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +15 -4
  120. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +34 -8
  121. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +223 -78
  122. package/deps/rocksdb/rocksdb/env/file_system.cc +6 -1
  123. package/deps/rocksdb/rocksdb/env/fs_posix.cc +53 -0
  124. package/deps/rocksdb/rocksdb/env/io_posix.cc +63 -17
  125. package/deps/rocksdb/rocksdb/env/io_posix.h +30 -1
  126. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +132 -48
  127. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +92 -24
  128. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +727 -109
  129. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +3 -4
  130. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +1 -1
  131. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +8 -0
  132. package/deps/rocksdb/rocksdb/include/rocksdb/attribute_groups.h +20 -1
  133. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +9 -0
  134. package/deps/rocksdb/rocksdb/include/rocksdb/configurable.h +9 -5
  135. package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +2 -0
  136. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +10 -2
  137. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +1 -0
  138. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +7 -0
  139. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +34 -37
  140. package/deps/rocksdb/rocksdb/include/rocksdb/iterator_base.h +21 -0
  141. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +56 -28
  142. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +3 -0
  143. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +36 -28
  144. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +11 -0
  145. package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +1 -0
  146. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h +84 -60
  147. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/secondary_index.h +102 -0
  148. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +89 -2
  149. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +32 -0
  150. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +30 -1
  151. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +23 -2
  152. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  153. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +2 -0
  154. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +79 -21
  155. package/deps/rocksdb/rocksdb/memtable/skiplist.h +41 -18
  156. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +1 -5
  157. package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.cc +169 -0
  158. package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.h +400 -0
  159. package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +2 -0
  160. package/deps/rocksdb/rocksdb/options/cf_options.cc +137 -82
  161. package/deps/rocksdb/rocksdb/options/cf_options.h +18 -6
  162. package/deps/rocksdb/rocksdb/options/configurable.cc +31 -17
  163. package/deps/rocksdb/rocksdb/options/configurable_helper.h +7 -6
  164. package/deps/rocksdb/rocksdb/options/options_helper.cc +10 -8
  165. package/deps/rocksdb/rocksdb/options/options_parser.cc +74 -54
  166. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +89 -0
  167. package/deps/rocksdb/rocksdb/options/options_test.cc +112 -26
  168. package/deps/rocksdb/rocksdb/port/port.h +5 -9
  169. package/deps/rocksdb/rocksdb/src.mk +8 -0
  170. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h +4 -0
  171. package/deps/rocksdb/rocksdb/table/block_based/block.h +1 -7
  172. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +2 -0
  173. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +62 -80
  174. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +13 -3
  175. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +16 -5
  176. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +38 -7
  177. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +12 -4
  178. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +4 -1
  179. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +4 -1
  180. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +204 -1
  181. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +3 -3
  182. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +2 -1
  183. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h +4 -0
  184. package/deps/rocksdb/rocksdb/table/format.cc +3 -3
  185. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +4 -1
  186. package/deps/rocksdb/rocksdb/table/mock_table.cc +0 -50
  187. package/deps/rocksdb/rocksdb/table/mock_table.h +53 -0
  188. package/deps/rocksdb/rocksdb/table/plain/plain_table_factory.h +4 -0
  189. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -1
  190. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +10 -5
  191. package/deps/rocksdb/rocksdb/table/table_builder.h +3 -1
  192. package/deps/rocksdb/rocksdb/table/table_properties.cc +181 -0
  193. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +5 -5
  194. package/deps/rocksdb/rocksdb/table/table_test.cc +71 -64
  195. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim.py +45 -45
  196. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py +35 -35
  197. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py +43 -43
  198. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +41 -4
  199. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +1 -0
  200. package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +1 -1
  201. package/deps/rocksdb/rocksdb/unreleased_history/add.sh +13 -0
  202. package/deps/rocksdb/rocksdb/util/aligned_buffer.h +24 -5
  203. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +7 -0
  204. package/deps/rocksdb/rocksdb/util/file_checksum_helper.cc +0 -52
  205. package/deps/rocksdb/rocksdb/util/file_checksum_helper.h +1 -10
  206. package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +92 -0
  207. package/deps/rocksdb/rocksdb/util/thread_operation.h +1 -0
  208. package/deps/rocksdb/rocksdb/util/udt_util.cc +50 -4
  209. package/deps/rocksdb/rocksdb/util/udt_util.h +24 -11
  210. package/deps/rocksdb/rocksdb/util/udt_util_test.cc +26 -13
  211. package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +1 -16
  212. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +2 -0
  213. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.cc +214 -0
  214. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.h +60 -0
  215. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index_test.cc +124 -0
  216. package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_mixin.h +441 -0
  217. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.cc +34 -3
  218. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.h +7 -2
  219. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +437 -0
  220. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +34 -11
  221. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +14 -7
  222. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +7 -1
  223. package/deps/rocksdb/rocksdb/utilities/transactions/snapshot_checker.cc +17 -0
  224. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +69 -0
  225. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +20 -0
  226. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +1290 -0
  227. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +324 -0
  228. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +18 -1
  229. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +8 -1
  230. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +57 -12
  231. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +32 -3
  232. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +33 -2
  233. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +721 -9
  234. package/deps/rocksdb/rocksdb.gyp +2 -0
  235. package/package.json +1 -1
  236. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  237. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
@@ -301,7 +301,7 @@ Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
301
301
  VersionEdit new_db_edit;
302
302
  const WriteOptions write_options(Env::IOActivity::kDBOpen);
303
303
  Status s = SetupDBId(write_options, /*read_only=*/false, /*is_new_db=*/true,
304
- &new_db_edit);
304
+ /*is_retry=*/false, &new_db_edit);
305
305
  if (!s.ok()) {
306
306
  return s;
307
307
  }
@@ -575,6 +575,7 @@ Status DBImpl::Recover(
575
575
  }
576
576
  if (s.ok() && !read_only) {
577
577
  for (auto cfd : *versions_->GetColumnFamilySet()) {
578
+ auto& moptions = *cfd->GetLatestMutableCFOptions();
578
579
  // Try to trivially move files down the LSM tree to start from bottommost
579
580
  // level when level_compaction_dynamic_level_bytes is enabled. This should
580
581
  // only be useful when user is migrating to turning on this option.
@@ -592,14 +593,14 @@ Status DBImpl::Recover(
592
593
  if (cfd->ioptions()->compaction_style ==
593
594
  CompactionStyle::kCompactionStyleLevel &&
594
595
  cfd->ioptions()->level_compaction_dynamic_level_bytes &&
595
- !cfd->GetLatestMutableCFOptions()->disable_auto_compactions) {
596
+ !moptions.disable_auto_compactions) {
596
597
  int to_level = cfd->ioptions()->num_levels - 1;
597
598
  // last level is reserved
598
599
  // allow_ingest_behind does not support Level Compaction,
599
600
  // and per_key_placement can have infinite compaction loop for Level
600
601
  // Compaction. Adjust to_level here just to be safe.
601
602
  if (cfd->ioptions()->allow_ingest_behind ||
602
- cfd->ioptions()->preclude_last_level_data_seconds > 0) {
603
+ moptions.preclude_last_level_data_seconds > 0) {
603
604
  to_level -= 1;
604
605
  }
605
606
  // Whether this column family has a level trivially moved
@@ -675,11 +676,11 @@ Status DBImpl::Recover(
675
676
  // Already set up DB ID in NewDB
676
677
  } else if (immutable_db_options_.write_dbid_to_manifest && recovery_ctx) {
677
678
  VersionEdit edit;
678
- s = SetupDBId(write_options, read_only, is_new_db, &edit);
679
+ s = SetupDBId(write_options, read_only, is_new_db, is_retry, &edit);
679
680
  recovery_ctx->UpdateVersionEdits(
680
681
  versions_->GetColumnFamilySet()->GetDefault(), edit);
681
682
  } else {
682
- s = SetupDBId(write_options, read_only, is_new_db, nullptr);
683
+ s = SetupDBId(write_options, read_only, is_new_db, is_retry, nullptr);
683
684
  }
684
685
  assert(!s.ok() || !db_id_.empty());
685
686
  ROCKS_LOG_INFO(immutable_db_options_.info_log, "DB ID: %s\n", db_id_.c_str());
@@ -1274,7 +1275,8 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
1274
1275
  reader.GetRecordedTimestampSize();
1275
1276
  status = HandleWriteBatchTimestampSizeDifference(
1276
1277
  &batch, running_ts_sz, record_ts_sz,
1277
- TimestampSizeConsistencyMode::kReconcileInconsistency, &new_batch);
1278
+ TimestampSizeConsistencyMode::kReconcileInconsistency, seq_per_batch_,
1279
+ batch_per_txn_, &new_batch);
1278
1280
  if (!status.ok()) {
1279
1281
  return status;
1280
1282
  }
@@ -1371,6 +1373,9 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
1371
1373
  }
1372
1374
  }
1373
1375
  }
1376
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
1377
+ "Recovered to log #%" PRIu64 " seq #%" PRIu64, wal_number,
1378
+ *next_sequence);
1374
1379
 
1375
1380
  if (!status.ok() || old_log_record) {
1376
1381
  if (status.IsNotSupported()) {
@@ -1403,10 +1408,6 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
1403
1408
  if (corrupted_wal_found != nullptr) {
1404
1409
  *corrupted_wal_found = true;
1405
1410
  }
1406
- ROCKS_LOG_INFO(immutable_db_options_.info_log,
1407
- "Point in time recovered to log #%" PRIu64
1408
- " seq #%" PRIu64,
1409
- wal_number, *next_sequence);
1410
1411
  } else {
1411
1412
  assert(immutable_db_options_.wal_recovery_mode ==
1412
1413
  WALRecoveryMode::kTolerateCorruptedTailRecords ||
@@ -1667,10 +1668,20 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
1667
1668
  Arena arena;
1668
1669
  Status s;
1669
1670
  TableProperties table_properties;
1671
+ const auto* ucmp = cfd->internal_comparator().user_comparator();
1672
+ assert(ucmp);
1673
+ const size_t ts_sz = ucmp->timestamp_size();
1674
+ const bool logical_strip_timestamp =
1675
+ ts_sz > 0 && !cfd->ioptions()->persist_user_defined_timestamps;
1670
1676
  {
1671
1677
  ScopedArenaPtr<InternalIterator> iter(
1672
- mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena,
1673
- /*prefix_extractor=*/nullptr));
1678
+ logical_strip_timestamp
1679
+ ? mem->NewTimestampStrippingIterator(
1680
+ ro, /*seqno_to_time_mapping=*/nullptr, &arena,
1681
+ /*prefix_extractor=*/nullptr, ts_sz)
1682
+ : mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena,
1683
+ /*prefix_extractor=*/nullptr,
1684
+ /*for_flush=*/true));
1674
1685
  ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
1675
1686
  "[%s] [WriteLevel0TableForRecovery]"
1676
1687
  " Level-0 table #%" PRIu64 ": started",
@@ -1705,11 +1716,14 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
1705
1716
  std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
1706
1717
  range_del_iters;
1707
1718
  auto range_del_iter =
1708
- // This is called during recovery, where a live memtable is flushed
1709
- // directly. In this case, no fragmented tombstone list is cached in
1710
- // this memtable yet.
1711
- mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber,
1712
- false /* immutable_memtable */);
1719
+ logical_strip_timestamp
1720
+ ? mem->NewTimestampStrippingRangeTombstoneIterator(
1721
+ ro, kMaxSequenceNumber, ts_sz)
1722
+ // This is called during recovery, where a live memtable is
1723
+ // flushed directly. In this case, no fragmented tombstone list is
1724
+ // cached in this memtable yet.
1725
+ : mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber,
1726
+ false /* immutable_memtable */);
1713
1727
  if (range_del_iter != nullptr) {
1714
1728
  range_del_iters.emplace_back(range_del_iter);
1715
1729
  }
@@ -1723,10 +1737,11 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
1723
1737
  cfd->internal_comparator(), cfd->internal_tbl_prop_coll_factories(),
1724
1738
  GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
1725
1739
  mutable_cf_options.compression_opts, cfd->GetID(), cfd->GetName(),
1726
- 0 /* level */, false /* is_bottommost */,
1727
- TableFileCreationReason::kRecovery, 0 /* oldest_key_time */,
1728
- 0 /* file_creation_time */, db_id_, db_session_id_,
1729
- 0 /* target_file_size */, meta.fd.GetNumber(), kMaxSequenceNumber);
1740
+ 0 /* level */, current_time /* newest_key_time */,
1741
+ false /* is_bottommost */, TableFileCreationReason::kRecovery,
1742
+ 0 /* oldest_key_time */, 0 /* file_creation_time */, db_id_,
1743
+ db_session_id_, 0 /* target_file_size */, meta.fd.GetNumber(),
1744
+ kMaxSequenceNumber);
1730
1745
  Version* version = cfd->current();
1731
1746
  version->Ref();
1732
1747
  uint64_t num_input_entries = 0;
@@ -1756,7 +1771,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
1756
1771
  s = io_s;
1757
1772
  }
1758
1773
 
1759
- uint64_t total_num_entries = mem->num_entries();
1774
+ uint64_t total_num_entries = mem->NumEntries();
1760
1775
  if (s.ok() && total_num_entries != num_input_entries) {
1761
1776
  std::string msg = "Expected " + std::to_string(total_num_entries) +
1762
1777
  " entries in memtable, but read " +
@@ -1795,9 +1810,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
1795
1810
 
1796
1811
  // For UDT in memtable only feature, move up the cutoff timestamp whenever
1797
1812
  // a flush happens.
1798
- const Comparator* ucmp = cfd->user_comparator();
1799
- size_t ts_sz = ucmp->timestamp_size();
1800
- if (ts_sz > 0 && !cfd->ioptions()->persist_user_defined_timestamps) {
1813
+ if (logical_strip_timestamp) {
1801
1814
  Slice mem_newest_udt = mem->GetNewestUDT();
1802
1815
  std::string full_history_ts_low = cfd->GetFullHistoryTsLow();
1803
1816
  if (full_history_ts_low.empty() ||
@@ -8,7 +8,7 @@
8
8
  #include "db/arena_wrapped_db_iter.h"
9
9
  #include "db/db_impl/compacted_db_impl.h"
10
10
  #include "db/db_impl/db_impl.h"
11
- #include "db/db_iter.h"
11
+ #include "db/manifest_ops.h"
12
12
  #include "db/merge_context.h"
13
13
  #include "logging/logging.h"
14
14
  #include "monitoring/perf_context_imp.h"
@@ -265,8 +265,8 @@ Status OpenForReadOnlyCheckExistence(const DBOptions& db_options,
265
265
  const std::shared_ptr<FileSystem>& fs = db_options.env->GetFileSystem();
266
266
  std::string manifest_path;
267
267
  uint64_t manifest_file_number;
268
- s = VersionSet::GetCurrentManifestPath(dbname, fs.get(), &manifest_path,
269
- &manifest_file_number);
268
+ s = GetCurrentManifestPath(dbname, fs.get(), /*is_retry=*/false,
269
+ &manifest_path, &manifest_file_number);
270
270
  } else {
271
271
  // Historic behavior that doesn't necessarily make sense
272
272
  s = db_options.env->CreateDirIfMissing(dbname);
@@ -233,7 +233,8 @@ Status DBImplSecondary::RecoverLogFiles(
233
233
  reader->GetRecordedTimestampSize();
234
234
  status = HandleWriteBatchTimestampSizeDifference(
235
235
  &batch, running_ts_sz, record_ts_sz,
236
- TimestampSizeConsistencyMode::kVerifyConsistency);
236
+ TimestampSizeConsistencyMode::kVerifyConsistency, seq_per_batch_,
237
+ batch_per_txn_);
237
238
  if (!status.ok()) {
238
239
  break;
239
240
  }
@@ -247,9 +248,7 @@ Status DBImplSecondary::RecoverLogFiles(
247
248
  if (cfd == nullptr) {
248
249
  continue;
249
250
  }
250
- if (cfds_changed->count(cfd) == 0) {
251
- cfds_changed->insert(cfd);
252
- }
251
+ cfds_changed->insert(cfd);
253
252
  const std::vector<FileMetaData*>& l0_files =
254
253
  cfd->current()->storage_info()->LevelFiles(0);
255
254
  SequenceNumber seq =
@@ -957,6 +956,10 @@ Status DB::OpenAndCompact(
957
956
  config_options.env = override_options.env;
958
957
  std::vector<ColumnFamilyDescriptor> all_column_families;
959
958
 
959
+ TEST_SYNC_POINT_CALLBACK(
960
+ "DBImplSecondary::OpenAndCompact::BeforeLoadingOptions:0",
961
+ &compaction_input.options_file_number);
962
+ TEST_SYNC_POINT("DBImplSecondary::OpenAndCompact::BeforeLoadingOptions:1");
960
963
  std::string options_file_name =
961
964
  OptionsFileName(name, compaction_input.options_file_number);
962
965
 
@@ -12,6 +12,7 @@
12
12
  #include "db/error_handler.h"
13
13
  #include "db/event_helpers.h"
14
14
  #include "logging/logging.h"
15
+ #include "memtable/wbwi_memtable.h"
15
16
  #include "monitoring/perf_context_imp.h"
16
17
  #include "options/options_helper.h"
17
18
  #include "test_util/sync_point.h"
@@ -189,16 +190,137 @@ Status DBImpl::WriteWithCallback(const WriteOptions& write_options,
189
190
  return s;
190
191
  }
191
192
 
192
- // The main write queue. This is the only write queue that updates LastSequence.
193
- // When using one write queue, the same sequence also indicates the last
194
- // published sequence.
193
+ Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
194
+ const WBWIMemTable::SeqnoRange& assigned_seqno,
195
+ uint64_t prep_log,
196
+ SequenceNumber last_seqno_after_ingest,
197
+ bool memtable_updated, bool ignore_missing_cf) {
198
+ // Keys in new memtable have seqno > last_seqno_after_ingest >= keys in wbwi.
199
+ assert(assigned_seqno.upper_bound <= last_seqno_after_ingest);
200
+ // Keys in the current memtable have seqno <= LastSequence() < keys in wbwi.
201
+ assert(assigned_seqno.lower_bound > versions_->LastSequence());
202
+ autovector<ReadOnlyMemTable*> memtables;
203
+ autovector<ColumnFamilyData*> cfds;
204
+ InstrumentedMutexLock lock(&mutex_);
205
+ ColumnFamilySet* cf_set = versions_->GetColumnFamilySet();
206
+
207
+ // Create WBWIMemTables
208
+ for (const auto [cf_id, stat] : wbwi->GetCFStats()) {
209
+ ColumnFamilyData* cfd = cf_set->GetColumnFamily(cf_id);
210
+ if (!cfd) {
211
+ if (ignore_missing_cf) {
212
+ continue;
213
+ }
214
+ for (auto mem : memtables) {
215
+ mem->Unref();
216
+ delete mem;
217
+ }
218
+ for (auto cfd_ptr : cfds) {
219
+ cfd_ptr->UnrefAndTryDelete();
220
+ }
221
+ Status s = Status::InvalidArgument(
222
+ "Invalid column family id from WriteBatchWithIndex: " +
223
+ std::to_string(cf_id));
224
+ if (memtable_updated) {
225
+ s = Status::Corruption(
226
+ "Part of the write batch is applied. Memtable is in a inconsistent "
227
+ "state. " +
228
+ s.ToString());
229
+ error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable);
230
+ }
231
+
232
+ return s;
233
+ }
234
+ WBWIMemTable* wbwi_memtable =
235
+ new WBWIMemTable(wbwi, cfd->user_comparator(), cf_id, cfd->ioptions(),
236
+ cfd->GetLatestMutableCFOptions(), stat);
237
+ wbwi_memtable->Ref();
238
+ wbwi_memtable->AssignSequenceNumbers(assigned_seqno);
239
+ // This is needed to keep the WAL that contains Prepare alive until
240
+ // committed data in this memtable is persisted.
241
+ wbwi_memtable->SetMinPrepLog(prep_log);
242
+ memtables.push_back(wbwi_memtable);
243
+ cfd->Ref();
244
+ cfds.push_back(cfd);
245
+ }
246
+
247
+ // Stop writes to the DB by entering both write threads
248
+ WriteThread::Writer nonmem_w;
249
+ if (two_write_queues_) {
250
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
251
+ }
252
+ WaitForPendingWrites();
253
+
254
+ // Switch memtable and add WBWIMemTables
255
+ Status s;
256
+ for (size_t i = 0; i < memtables.size(); ++i) {
257
+ assert(!immutable_db_options_.atomic_flush);
258
+ // NOTE: to support atomic flush, need to call
259
+ // SelectColumnFamiliesForAtomicFlush()
260
+ WriteContext write_context;
261
+ // TODO: not switch on empty memtable, may need to update metadata
262
+ // like NextLogNumber(), earliest_seqno and memtable id.
263
+ s = SwitchMemtable(cfds[i], &write_context, memtables[i],
264
+ last_seqno_after_ingest);
265
+ if (!s.ok()) {
266
+ // SwitchMemtable() can only fail if a new WAL is to be created, this
267
+ // should only happen for the first call to SwitchMemtable(). log will
268
+ // be empty and no new WAL is created for the rest of the calls.
269
+ assert(i == 0);
270
+ if (i != 0 || memtable_updated) {
271
+ // escalate error to non-recoverable
272
+ s = Status::Corruption(
273
+ "Part of the write batch is applied. Memtable is in a inconsistent "
274
+ "state. " +
275
+ s.ToString());
276
+ error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable);
277
+ } else {
278
+ // SwitchMemtable() already sets appropriate bg error
279
+ }
280
+ for (size_t j = i; j < memtables.size(); j++) {
281
+ memtables[j]->Unref();
282
+ delete memtables[j];
283
+ }
284
+ break;
285
+ }
286
+ }
287
+ for (size_t i = 0; i < cfds.size(); ++i) {
288
+ if (cfds[i]->UnrefAndTryDelete()) {
289
+ cfds[i] = nullptr;
290
+ }
291
+ }
292
+
293
+ // exit the second queue before returning
294
+ if (two_write_queues_) {
295
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
296
+ }
297
+ if (s.ok()) {
298
+ // Trigger flushes for the new immutable memtables.
299
+ for (const auto cfd : cfds) {
300
+ if (cfd == nullptr) {
301
+ continue;
302
+ }
303
+ cfd->imm()->FlushRequested();
304
+ FlushRequest flush_req;
305
+ // TODO: a new flush reason for ingesting memtable
306
+ GenerateFlushRequest({cfd}, FlushReason::kExternalFileIngestion,
307
+ &flush_req);
308
+ EnqueuePendingFlush(flush_req);
309
+ }
310
+ MaybeScheduleFlushOrCompaction();
311
+ }
312
+ return s;
313
+ }
314
+
195
315
  Status DBImpl::WriteImpl(const WriteOptions& write_options,
196
316
  WriteBatch* my_batch, WriteCallback* callback,
197
317
  UserWriteCallback* user_write_cb, uint64_t* log_used,
198
318
  uint64_t log_ref, bool disable_memtable,
199
319
  uint64_t* seq_used, size_t batch_cnt,
200
320
  PreReleaseCallback* pre_release_callback,
201
- PostMemTableCallback* post_memtable_callback) {
321
+ PostMemTableCallback* post_memtable_callback,
322
+ std::shared_ptr<WriteBatchWithIndex> wbwi,
323
+ uint64_t prep_log) {
202
324
  assert(!seq_per_batch_ || batch_cnt != 0);
203
325
  assert(my_batch == nullptr || my_batch->Count() == 0 ||
204
326
  write_options.protection_bytes_per_key == 0 ||
@@ -287,6 +409,23 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
287
409
  return Status::NotSupported(
288
410
  "DeleteRange is not compatible with row cache.");
289
411
  }
412
+ if (wbwi) {
413
+ assert(prep_log > 0);
414
+ // Used only in WriteCommittedTxn::CommitInternal() with no `callback`.
415
+ assert(!callback);
416
+ if (immutable_db_options_.unordered_write) {
417
+ return Status::NotSupported(
418
+ "Ingesting WriteBatch does not support unordered_write");
419
+ }
420
+ if (immutable_db_options_.enable_pipelined_write) {
421
+ return Status::NotSupported(
422
+ "Ingesting WriteBatch does not support pipelined_write");
423
+ }
424
+ if (immutable_db_options_.atomic_flush) {
425
+ return Status::NotSupported(
426
+ "Ingesting WriteBatch does not support atomic_flush");
427
+ }
428
+ }
290
429
  // Otherwise IsLatestPersistentState optimization does not make sense
291
430
  assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) ||
292
431
  disable_memtable);
@@ -344,7 +483,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
344
483
  PERF_TIMER_GUARD(write_pre_and_post_process_time);
345
484
  WriteThread::Writer w(write_options, my_batch, callback, user_write_cb,
346
485
  log_ref, disable_memtable, batch_cnt,
347
- pre_release_callback, post_memtable_callback);
486
+ pre_release_callback, post_memtable_callback,
487
+ /*_ingest_wbwi=*/wbwi != nullptr);
348
488
  StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
349
489
 
350
490
  write_thread_.JoinBatchGroup(&w);
@@ -441,6 +581,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
441
581
  TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeLeaderEnters");
442
582
  last_batch_group_size_ =
443
583
  write_thread_.EnterAsBatchGroupLeader(&w, &write_group);
584
+ if (wbwi) {
585
+ assert(write_group.size == 1);
586
+ }
444
587
 
445
588
  IOStatus io_s;
446
589
  Status pre_release_cb_status;
@@ -494,10 +637,25 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
494
637
  // Note about seq_per_batch_: either disableWAL is set for the entire write
495
638
  // group or not. In either case we inc seq for each write batch with no
496
639
  // failed callback. This means that there could be a batch with
497
- // disalbe_memtable in between; although we do not write this batch to
640
+ // disable_memtable in between; although we do not write this batch to
498
641
  // memtable it still consumes a seq. Otherwise, if !seq_per_batch_, we inc
499
642
  // the seq per valid written key to mem.
500
643
  size_t seq_inc = seq_per_batch_ ? valid_batches : total_count;
644
+ if (wbwi) {
645
+ // Reserve sequence numbers for the ingested memtable. We need to reserve
646
+ // at lease this amount for recovery. During recovery,
647
+ // transactions do not commit by ingesting WBWI. The sequence number
648
+ // associated with the commit entry in WAL is used as the starting
649
+ // sequence number for inserting into memtable. We need to reserve
650
+ // enough sequence numbers here (at least the number of operations
651
+ // in write batch) to assign to memtable entries for this transaction.
652
+ // This prevents updates in different transactions from using out-of-order
653
+ // sequence numbers or the same key+seqno.
654
+ //
655
+ // WBWI ingestion requires not grouping writes, so we don't need to
656
+ // consider incrementing sequence number for WBWI from other writers.
657
+ seq_inc += wbwi->GetWriteBatch()->Count();
658
+ }
501
659
 
502
660
  const bool concurrent_update = two_write_queues_;
503
661
  // Update stats while we are an exclusive group leader, so we know
@@ -674,6 +832,27 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
674
832
  // handle exit, false means somebody else did
675
833
  should_exit_batch_group = write_thread_.CompleteParallelMemTableWriter(&w);
676
834
  }
835
+ if (wbwi) {
836
+ if (status.ok() && w.status.ok()) {
837
+ // w.batch contains (potentially empty) commit time batch updates,
838
+ // only ingest wbwi if w.batch is applied to memtable successfully
839
+ assert(wbwi->GetWriteBatch()->Count() > 0);
840
+
841
+ uint32_t memtable_update_count = w.batch->Count();
842
+ SequenceNumber lb = versions_->LastSequence() + memtable_update_count + 1;
843
+ SequenceNumber ub = versions_->LastSequence() + memtable_update_count +
844
+ wbwi->GetWriteBatch()->Count();
845
+ assert(ub == last_sequence);
846
+ if (two_write_queues_) {
847
+ assert(ub <= versions_->LastAllocatedSequence());
848
+ }
849
+ status = IngestWBWI(wbwi, {/*lower_bound=*/lb, /*upper_bound=*/ub},
850
+ prep_log, last_sequence,
851
+ /*memtable_updated=*/memtable_update_count > 0,
852
+ write_options.ignore_missing_column_families);
853
+ }
854
+ }
855
+
677
856
  if (should_exit_batch_group) {
678
857
  if (status.ok()) {
679
858
  for (auto* tmp_w : write_group) {
@@ -687,7 +866,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
687
866
  }
688
867
  }
689
868
  // Note: if we are to resume after non-OK statuses we need to revisit how
690
- // we reacts to non-OK statuses here.
869
+ // we react to non-OK statuses here.
691
870
  versions_->SetLastSequence(last_sequence);
692
871
  }
693
872
  MemTableInsertStatusCheck(w.status);
@@ -735,17 +914,6 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
735
914
  size_t total_byte_size = 0;
736
915
 
737
916
  if (w.status.ok()) {
738
- // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
739
- // grabs but does not seem thread-safe.
740
- if (tracer_) {
741
- InstrumentedMutexLock lock(&trace_mutex_);
742
- if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
743
- for (auto* writer : wal_write_group) {
744
- // TODO: maybe handle the tracing status?
745
- tracer_->Write(writer->batch).PermitUncheckedError();
746
- }
747
- }
748
- }
749
917
  SequenceNumber next_sequence = current_sequence;
750
918
  for (auto* writer : wal_write_group) {
751
919
  assert(writer);
@@ -760,6 +928,22 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
760
928
  }
761
929
  }
762
930
  }
931
+ // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
932
+ // grabs but does not seem thread-safe.
933
+ if (tracer_) {
934
+ InstrumentedMutexLock lock(&trace_mutex_);
935
+ if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
936
+ for (auto* writer : wal_write_group) {
937
+ if (writer->CallbackFailed()) {
938
+ // When optimisitc txn conflict checking fails, we should
939
+ // not record to trace.
940
+ continue;
941
+ }
942
+ // TODO: maybe handle the tracing status?
943
+ tracer_->Write(writer->batch).PermitUncheckedError();
944
+ }
945
+ }
946
+ }
763
947
  if (w.disable_wal) {
764
948
  has_unpersisted_data_.store(true, std::memory_order_relaxed);
765
949
  }
@@ -1005,19 +1189,6 @@ Status DBImpl::WriteImplWALOnly(
1005
1189
  WriteThread::WriteGroup write_group;
1006
1190
  uint64_t last_sequence;
1007
1191
  write_thread->EnterAsBatchGroupLeader(&w, &write_group);
1008
- // Note: no need to update last_batch_group_size_ here since the batch writes
1009
- // to WAL only
1010
- // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
1011
- // grabs but does not seem thread-safe.
1012
- if (tracer_) {
1013
- InstrumentedMutexLock lock(&trace_mutex_);
1014
- if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
1015
- for (auto* writer : write_group) {
1016
- // TODO: maybe handle the tracing status?
1017
- tracer_->Write(writer->batch).PermitUncheckedError();
1018
- }
1019
- }
1020
- }
1021
1192
 
1022
1193
  size_t pre_release_callback_cnt = 0;
1023
1194
  size_t total_byte_size = 0;
@@ -1032,6 +1203,23 @@ Status DBImpl::WriteImplWALOnly(
1032
1203
  }
1033
1204
  }
1034
1205
 
1206
+ // Note: no need to update last_batch_group_size_ here since the batch writes
1207
+ // to WAL only
1208
+ // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
1209
+ // grabs but does not seem thread-safe.
1210
+ if (tracer_) {
1211
+ InstrumentedMutexLock lock(&trace_mutex_);
1212
+ if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
1213
+ for (auto* writer : write_group) {
1214
+ if (writer->CallbackFailed()) {
1215
+ continue;
1216
+ }
1217
+ // TODO: maybe handle the tracing status?
1218
+ tracer_->Write(writer->batch).PermitUncheckedError();
1219
+ }
1220
+ }
1221
+ }
1222
+
1035
1223
  const bool concurrent_update = true;
1036
1224
  // Update stats while we are an exclusive group leader, so we know
1037
1225
  // that nobody else can be writing to these particular stats.
@@ -1201,7 +1389,6 @@ void DBImpl::MemTableInsertStatusCheck(const Status& status) {
1201
1389
  if (!status.ok()) {
1202
1390
  mutex_.Lock();
1203
1391
  assert(!error_handler_.IsBGWorkStopped());
1204
- // Maybe change the return status to void?
1205
1392
  error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable);
1206
1393
  mutex_.Unlock();
1207
1394
  }
@@ -1601,6 +1788,8 @@ IOStatus DBImpl::ConcurrentWriteToWAL(
1601
1788
  Status DBImpl::WriteRecoverableState() {
1602
1789
  mutex_.AssertHeld();
1603
1790
  if (!cached_recoverable_state_empty_) {
1791
+ // Only for write-prepared and write-unprepared.
1792
+ assert(seq_per_batch_);
1604
1793
  bool dont_care_bool;
1605
1794
  SequenceNumber next_seq;
1606
1795
  if (two_write_queues_) {
@@ -2193,16 +2382,13 @@ void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/,
2193
2382
  mutex_.Lock();
2194
2383
  }
2195
2384
 
2196
- // REQUIRES: mutex_ is held
2197
- // REQUIRES: this thread is currently at the front of the writer queue
2198
- // REQUIRES: this thread is currently at the front of the 2nd writer queue if
2199
- // two_write_queues_ is true (This is to simplify the reasoning.)
2200
- Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
2385
+ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
2386
+ ReadOnlyMemTable* new_imm,
2387
+ SequenceNumber last_seqno) {
2201
2388
  mutex_.AssertHeld();
2202
2389
  assert(lock_wal_count_ == 0);
2203
2390
 
2204
2391
  // TODO: plumb Env::IOActivity, Env::IOPriority
2205
- const ReadOptions read_options;
2206
2392
  const WriteOptions write_options;
2207
2393
 
2208
2394
  log::Writer* new_log = nullptr;
@@ -2238,12 +2424,13 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
2238
2424
  const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
2239
2425
 
2240
2426
  // Set memtable_info for memtable sealed callback
2427
+ // TODO: memtable_info for `new_imm`
2241
2428
  MemTableInfo memtable_info;
2242
2429
  memtable_info.cf_name = cfd->GetName();
2243
2430
  memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber();
2244
2431
  memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber();
2245
- memtable_info.num_entries = cfd->mem()->num_entries();
2246
- memtable_info.num_deletes = cfd->mem()->num_deletes();
2432
+ memtable_info.num_entries = cfd->mem()->NumEntries();
2433
+ memtable_info.num_deletes = cfd->mem()->NumDeletion();
2247
2434
  if (!cfd->ioptions()->persist_user_defined_timestamps &&
2248
2435
  cfd->user_comparator()->timestamp_size() > 0) {
2249
2436
  const Slice& newest_udt = cfd->mem()->GetNewestUDT();
@@ -2265,8 +2452,20 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
2265
2452
  }
2266
2453
  }
2267
2454
  if (s.ok()) {
2268
- SequenceNumber seq = versions_->LastSequence();
2269
- new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq);
2455
+ // FIXME: from the comment for GetEarliestSequenceNumber(), any key with
2456
+ // seqno >= earliest_seqno should be in this or later memtable. This means
2457
+ // we should use LastSequence() + 1 or last_seqno + 1 here. And it needs to
2458
+ // be incremented with file ingestion and other operations that consumes
2459
+ // sequence number.
2460
+ SequenceNumber seq;
2461
+ if (new_imm) {
2462
+ assert(last_seqno > versions_->LastSequence());
2463
+ seq = last_seqno;
2464
+ } else {
2465
+ seq = versions_->LastSequence();
2466
+ }
2467
+ new_mem =
2468
+ cfd->ConstructNewMemtable(mutable_cf_options, /*earliest_seq=*/seq);
2270
2469
  context->superversion_context.NewSuperVersion();
2271
2470
 
2272
2471
  ROCKS_LOG_INFO(immutable_db_options_.info_log,
@@ -2348,6 +2547,8 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
2348
2547
  versions_->PreComputeMinLogNumberWithUnflushedData(logfile_number_);
2349
2548
  if (min_wal_number_to_keep >
2350
2549
  versions_->GetWalSet().GetMinWalNumberToKeep()) {
2550
+ // TODO: plumb Env::IOActivity, Env::IOPriority
2551
+ const ReadOptions read_options;
2351
2552
  // Get a snapshot of the empty column families.
2352
2553
  // LogAndApply may release and reacquire db
2353
2554
  // mutex, during that period, column family may become empty (e.g. its
@@ -2405,6 +2606,18 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
2405
2606
  cfd->mem()->SetNextLogNumber(logfile_number_);
2406
2607
  assert(new_mem != nullptr);
2407
2608
  cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
2609
+ if (new_imm) {
2610
+ // Need to assign memtable id here before SetMemtable() below assigns id to
2611
+ // the new live memtable
2612
+ cfd->AssignMemtableID(new_imm);
2613
+ // NOTE: new_imm and cfd->mem() references the same WAL and has the same
2614
+ // NextLogNumber(). They should be flushed together. For non-atomic-flush,
2615
+ // we always try to flush all immutable memtable. For atomic flush, these
2616
+ // two memtables will be marked eligible for flush in the same call to
2617
+ // AssignAtomicFlushSeq().
2618
+ new_imm->SetNextLogNumber(logfile_number_);
2619
+ cfd->imm()->Add(new_imm, &context->memtables_to_free_);
2620
+ }
2408
2621
  new_mem->Ref();
2409
2622
  cfd->SetMemtable(new_mem);
2410
2623
  InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context,
@@ -2417,6 +2630,9 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
2417
2630
  // that is okay. If we did, it most likely means that s was already an error.
2418
2631
  // In any case, ignore any unchecked error for i_os here.
2419
2632
  io_s.PermitUncheckedError();
2633
+ // We guarantee that if a non-ok status is returned, `new_imm` was not added
2634
+ // to the db.
2635
+ assert(s.ok());
2420
2636
  return s;
2421
2637
  }
2422
2638