@nxtedition/rocksdb 13.1.4 → 13.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. package/binding.cc +43 -16
  2. package/deps/rocksdb/rocksdb/{TARGETS → BUCK} +27 -0
  3. package/deps/rocksdb/rocksdb/CMakeLists.txt +3 -1
  4. package/deps/rocksdb/rocksdb/Makefile +2 -2
  5. package/deps/rocksdb/rocksdb/cache/cache.cc +3 -1
  6. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +2 -0
  7. package/deps/rocksdb/rocksdb/db/attribute_group_iterator_impl.h +34 -9
  8. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +7 -6
  9. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +5 -1
  10. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +22 -14
  11. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +149 -0
  12. package/deps/rocksdb/rocksdb/db/builder.cc +13 -24
  13. package/deps/rocksdb/rocksdb/db/coalescing_iterator.h +35 -10
  14. package/deps/rocksdb/rocksdb/db/column_family.cc +21 -10
  15. package/deps/rocksdb/rocksdb/db/column_family.h +15 -8
  16. package/deps/rocksdb/rocksdb/db/column_family_test.cc +98 -7
  17. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +126 -16
  18. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +51 -5
  19. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +2 -2
  20. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +2 -8
  21. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +24 -0
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +52 -22
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +9 -7
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +36 -9
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +6 -0
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +30 -17
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +26 -23
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +43 -33
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +6 -5
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +19 -9
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +6 -5
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +632 -411
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +171 -51
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +7 -5
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +37 -10
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +51 -11
  37. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +10 -3
  38. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +350 -154
  39. package/deps/rocksdb/rocksdb/db/convenience.cc +1 -1
  40. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +62 -27
  41. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +68 -1
  42. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +91 -0
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +134 -70
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +71 -23
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +43 -16
  46. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +47 -33
  47. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +27 -19
  48. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +38 -25
  49. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +3 -3
  50. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +7 -4
  51. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +258 -42
  52. package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +161 -9
  53. package/deps/rocksdb/rocksdb/db/db_iter.cc +118 -86
  54. package/deps/rocksdb/rocksdb/db/db_iter.h +44 -17
  55. package/deps/rocksdb/rocksdb/db/db_options_test.cc +27 -6
  56. package/deps/rocksdb/rocksdb/db/db_test.cc +48 -16
  57. package/deps/rocksdb/rocksdb/db/db_test2.cc +60 -15
  58. package/deps/rocksdb/rocksdb/db/db_test_util.cc +97 -44
  59. package/deps/rocksdb/rocksdb/db/db_test_util.h +7 -1
  60. package/deps/rocksdb/rocksdb/db/dbformat.cc +15 -5
  61. package/deps/rocksdb/rocksdb/db/dbformat.h +137 -55
  62. package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -0
  63. package/deps/rocksdb/rocksdb/db/experimental.cc +54 -0
  64. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +663 -8
  65. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +152 -91
  66. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +134 -11
  67. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +55 -9
  68. package/deps/rocksdb/rocksdb/db/flush_job.cc +52 -29
  69. package/deps/rocksdb/rocksdb/db/flush_job.h +5 -3
  70. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +18 -12
  71. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +23 -29
  72. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +3 -2
  73. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +2 -0
  74. package/deps/rocksdb/rocksdb/db/internal_stats.cc +9 -6
  75. package/deps/rocksdb/rocksdb/db/internal_stats.h +54 -0
  76. package/deps/rocksdb/rocksdb/db/job_context.h +1 -1
  77. package/deps/rocksdb/rocksdb/db/log_reader.cc +6 -7
  78. package/deps/rocksdb/rocksdb/db/manifest_ops.cc +47 -0
  79. package/deps/rocksdb/rocksdb/db/manifest_ops.h +20 -0
  80. package/deps/rocksdb/rocksdb/db/memtable.cc +165 -64
  81. package/deps/rocksdb/rocksdb/db/memtable.h +422 -243
  82. package/deps/rocksdb/rocksdb/db/memtable_list.cc +99 -68
  83. package/deps/rocksdb/rocksdb/db/memtable_list.h +63 -38
  84. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +28 -25
  85. package/deps/rocksdb/rocksdb/db/multi_cf_iterator_impl.h +118 -60
  86. package/deps/rocksdb/rocksdb/db/multi_cf_iterator_test.cc +344 -89
  87. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +2 -3
  88. package/deps/rocksdb/rocksdb/db/repair.cc +15 -14
  89. package/deps/rocksdb/rocksdb/db/repair_test.cc +0 -13
  90. package/deps/rocksdb/rocksdb/db/snapshot_checker.h +7 -0
  91. package/deps/rocksdb/rocksdb/db/table_cache.cc +62 -65
  92. package/deps/rocksdb/rocksdb/db/table_cache.h +70 -76
  93. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +5 -6
  94. package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +1 -1
  95. package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +8 -7
  96. package/deps/rocksdb/rocksdb/db/version_builder.cc +17 -19
  97. package/deps/rocksdb/rocksdb/db/version_builder.h +13 -12
  98. package/deps/rocksdb/rocksdb/db/version_edit.h +30 -0
  99. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +3 -5
  100. package/deps/rocksdb/rocksdb/db/version_set.cc +89 -129
  101. package/deps/rocksdb/rocksdb/db/version_set.h +12 -4
  102. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +1 -2
  103. package/deps/rocksdb/rocksdb/db/version_set_test.cc +12 -8
  104. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +0 -15
  105. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +0 -2
  106. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +9 -7
  107. package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.cc +0 -8
  108. package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.h +28 -2
  109. package/deps/rocksdb/rocksdb/db/write_batch.cc +32 -10
  110. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +9 -0
  111. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +2 -1
  112. package/deps/rocksdb/rocksdb/db/write_thread.cc +3 -1
  113. package/deps/rocksdb/rocksdb/db/write_thread.h +6 -2
  114. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +15 -0
  115. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +7 -0
  116. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +4 -0
  117. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +18 -2
  118. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +100 -22
  119. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +15 -4
  120. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +34 -8
  121. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +223 -78
  122. package/deps/rocksdb/rocksdb/env/file_system.cc +6 -1
  123. package/deps/rocksdb/rocksdb/env/fs_posix.cc +53 -0
  124. package/deps/rocksdb/rocksdb/env/io_posix.cc +63 -17
  125. package/deps/rocksdb/rocksdb/env/io_posix.h +30 -1
  126. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +132 -48
  127. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +92 -24
  128. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +727 -109
  129. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +3 -4
  130. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +1 -1
  131. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +8 -0
  132. package/deps/rocksdb/rocksdb/include/rocksdb/attribute_groups.h +20 -1
  133. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +9 -0
  134. package/deps/rocksdb/rocksdb/include/rocksdb/configurable.h +9 -5
  135. package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +2 -0
  136. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +10 -2
  137. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +1 -0
  138. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +7 -0
  139. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +34 -37
  140. package/deps/rocksdb/rocksdb/include/rocksdb/iterator_base.h +21 -0
  141. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +56 -28
  142. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +3 -0
  143. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +36 -28
  144. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +11 -0
  145. package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +1 -0
  146. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h +84 -60
  147. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/secondary_index.h +102 -0
  148. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +89 -2
  149. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +32 -0
  150. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +30 -1
  151. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +23 -2
  152. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  153. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +2 -0
  154. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +79 -21
  155. package/deps/rocksdb/rocksdb/memtable/skiplist.h +41 -18
  156. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +1 -5
  157. package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.cc +169 -0
  158. package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.h +400 -0
  159. package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +2 -0
  160. package/deps/rocksdb/rocksdb/options/cf_options.cc +137 -82
  161. package/deps/rocksdb/rocksdb/options/cf_options.h +18 -6
  162. package/deps/rocksdb/rocksdb/options/configurable.cc +31 -17
  163. package/deps/rocksdb/rocksdb/options/configurable_helper.h +7 -6
  164. package/deps/rocksdb/rocksdb/options/options_helper.cc +10 -8
  165. package/deps/rocksdb/rocksdb/options/options_parser.cc +74 -54
  166. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +89 -0
  167. package/deps/rocksdb/rocksdb/options/options_test.cc +112 -26
  168. package/deps/rocksdb/rocksdb/port/port.h +5 -9
  169. package/deps/rocksdb/rocksdb/src.mk +8 -0
  170. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h +4 -0
  171. package/deps/rocksdb/rocksdb/table/block_based/block.h +1 -7
  172. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +2 -0
  173. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +62 -80
  174. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +13 -3
  175. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +16 -5
  176. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +38 -7
  177. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +12 -4
  178. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +4 -1
  179. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +4 -1
  180. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +204 -1
  181. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +3 -3
  182. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +2 -1
  183. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h +4 -0
  184. package/deps/rocksdb/rocksdb/table/format.cc +3 -3
  185. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +4 -1
  186. package/deps/rocksdb/rocksdb/table/mock_table.cc +0 -50
  187. package/deps/rocksdb/rocksdb/table/mock_table.h +53 -0
  188. package/deps/rocksdb/rocksdb/table/plain/plain_table_factory.h +4 -0
  189. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -1
  190. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +10 -5
  191. package/deps/rocksdb/rocksdb/table/table_builder.h +3 -1
  192. package/deps/rocksdb/rocksdb/table/table_properties.cc +181 -0
  193. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +5 -5
  194. package/deps/rocksdb/rocksdb/table/table_test.cc +71 -64
  195. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim.py +45 -45
  196. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py +35 -35
  197. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py +43 -43
  198. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +41 -4
  199. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +1 -0
  200. package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +1 -1
  201. package/deps/rocksdb/rocksdb/unreleased_history/add.sh +13 -0
  202. package/deps/rocksdb/rocksdb/util/aligned_buffer.h +24 -5
  203. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +7 -0
  204. package/deps/rocksdb/rocksdb/util/file_checksum_helper.cc +0 -52
  205. package/deps/rocksdb/rocksdb/util/file_checksum_helper.h +1 -10
  206. package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +92 -0
  207. package/deps/rocksdb/rocksdb/util/thread_operation.h +1 -0
  208. package/deps/rocksdb/rocksdb/util/udt_util.cc +50 -4
  209. package/deps/rocksdb/rocksdb/util/udt_util.h +24 -11
  210. package/deps/rocksdb/rocksdb/util/udt_util_test.cc +26 -13
  211. package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +1 -16
  212. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +2 -0
  213. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.cc +214 -0
  214. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.h +60 -0
  215. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index_test.cc +124 -0
  216. package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_mixin.h +441 -0
  217. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.cc +34 -3
  218. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.h +7 -2
  219. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +437 -0
  220. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +34 -11
  221. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +14 -7
  222. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +7 -1
  223. package/deps/rocksdb/rocksdb/utilities/transactions/snapshot_checker.cc +17 -0
  224. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +69 -0
  225. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +20 -0
  226. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +1290 -0
  227. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +324 -0
  228. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +18 -1
  229. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +8 -1
  230. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +57 -12
  231. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +32 -3
  232. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +33 -2
  233. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +721 -9
  234. package/deps/rocksdb/rocksdb.gyp +2 -0
  235. package/package.json +1 -1
  236. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  237. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
@@ -18,6 +18,7 @@

  #include "db/dbformat.h"
  #include "db/kv_checksum.h"
+ #include "db/merge_helper.h"
  #include "db/range_tombstone_fragmenter.h"
  #include "db/read_callback.h"
  #include "db/seqno_to_time_mapping.h"
@@ -76,88 +77,48 @@ struct MemTablePostProcessInfo {
  };

  using MultiGetRange = MultiGetContext::Range;
- // Note: Many of the methods in this class have comments indicating that
+
+ // For each CF, rocksdb maintains an active memtable that accept writes,
+ // and zero or more sealed memtables that we call immutable memtables.
+ // This interface contains all methods required for immutable memtables.
+ // MemTable class inherit from `ReadOnlyMemTable` and implements additional
+ // methods required for active memtables.
+ // Immutable memtable list (MemTableList) maintains a list of ReadOnlyMemTable
+ // objects. This interface enables feature like direct ingestion of an
+ // immutable memtable with custom implementation, bypassing memtable writes.
+ //
+ // Note: Many of the methods in this class have comments indicating that
  // external synchronization is required as these methods are not thread-safe.
  // It is up to higher layers of code to decide how to prevent concurrent
- // invocation of these methods. This is usually done by acquiring either
+ // invocation of these methods. This is usually done by acquiring either
  // the db mutex or the single writer thread.
  //
  // Some of these methods are documented to only require external
- // synchronization if this memtable is immutable. Calling MarkImmutable() is
+ // synchronization if this memtable is immutable. Calling MarkImmutable() is
  // not sufficient to guarantee immutability. It is up to higher layers of
  // code to determine if this MemTable can still be modified by other threads.
  // Eg: The Superversion stores a pointer to the current MemTable (that can
  // be modified) and a separate list of the MemTables that can no longer be
  // written to (aka the 'immutable memtables').
- class MemTable {
+ //
+ // MemTables are reference counted. The initial reference count
+ // is zero and the caller must call Ref() at least once.
+ class ReadOnlyMemTable {
  public:
- struct KeyComparator : public MemTableRep::KeyComparator {
- const InternalKeyComparator comparator;
- explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) {}
- int operator()(const char* prefix_len_key1,
- const char* prefix_len_key2) const override;
- int operator()(const char* prefix_len_key,
- const DecodedType& key) const override;
- };
-
- // MemTables are reference counted. The initial reference count
- // is zero and the caller must call Ref() at least once.
- //
- // earliest_seq should be the current SequenceNumber in the db such that any
- // key inserted into this memtable will have an equal or larger seq number.
- // (When a db is first created, the earliest sequence number will be 0).
- // If the earliest sequence number is not known, kMaxSequenceNumber may be
- // used, but this may prevent some transactions from succeeding until the
- // first key is inserted into the memtable.
- explicit MemTable(const InternalKeyComparator& comparator,
- const ImmutableOptions& ioptions,
- const MutableCFOptions& mutable_cf_options,
- WriteBufferManager* write_buffer_manager,
- SequenceNumber earliest_seq, uint32_t column_family_id);
- // No copying allowed
- MemTable(const MemTable&) = delete;
- MemTable& operator=(const MemTable&) = delete;
-
  // Do not delete this MemTable unless Unref() indicates it not in use.
- ~MemTable();
+ virtual ~ReadOnlyMemTable() = default;

- // Increase reference count.
- // REQUIRES: external synchronization to prevent simultaneous
- // operations on the same MemTable.
- void Ref() { ++refs_; }
-
- // Drop reference count.
- // If the refcount goes to zero return this memtable, otherwise return null.
- // REQUIRES: external synchronization to prevent simultaneous
- // operations on the same MemTable.
- MemTable* Unref() {
- --refs_;
- assert(refs_ >= 0);
- if (refs_ <= 0) {
- return this;
- }
- return nullptr;
- }
+ virtual const char* Name() const = 0;

  // Returns an estimate of the number of bytes of data in use by this
  // data structure.
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
- size_t ApproximateMemoryUsage();
-
- // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't
- // require external synchronization. The value may be less accurate though
- size_t ApproximateMemoryUsageFast() const {
- return approximate_memory_usage_.load(std::memory_order_relaxed);
- }
+ virtual size_t ApproximateMemoryUsage() = 0;

  // used by MemTableListVersion::MemoryAllocatedBytesExcludingLast
- size_t MemoryAllocatedBytes() const {
- return table_->ApproximateMemoryUsage() +
- range_del_table_->ApproximateMemoryUsage() +
- arena_.MemoryAllocatedBytes();
- }
+ virtual size_t MemoryAllocatedBytes() const = 0;

  // Returns a vector of unique random memtable entries of size 'sample_size'.
  //
@@ -172,27 +133,8 @@ class MemTable {
  // REQUIRES: SkipList memtable representation. This function is not
  // implemented for any other type of memtable representation (vectorrep,
  // hashskiplist,...).
- void UniqueRandomSample(const uint64_t& target_sample_size,
- std::unordered_set<const char*>* entries) {
- // TODO(bjlemaire): at the moment, only supported by skiplistrep.
- // Extend it to all other memtable representations.
- table_->UniqueRandomSample(num_entries(), target_sample_size, entries);
- }
-
- // This method heuristically determines if the memtable should continue to
- // host more data.
- bool ShouldScheduleFlush() const {
- return flush_state_.load(std::memory_order_relaxed) == FLUSH_REQUESTED;
- }
-
- // Returns true if a flush should be scheduled and the caller should
- // be the one to schedule it
- bool MarkFlushScheduled() {
- auto before = FLUSH_REQUESTED;
- return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED,
- std::memory_order_relaxed,
- std::memory_order_relaxed);
- }
+ virtual void UniqueRandomSample(const uint64_t& target_sample_size,
+ std::unordered_set<const char*>* entries) = 0;

  // Return an iterator that yields the contents of the memtable.
  //
@@ -208,10 +150,18 @@
  // those allocated in arena.
  // seqno_to_time_mapping: it's used to support return write unix time for the
  // data, currently only needed for iterators serving user reads.
- InternalIterator* NewIterator(
+ virtual InternalIterator* NewIterator(
  const ReadOptions& read_options,
  UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena,
- const SliceTransform* prefix_extractor);
+ const SliceTransform* prefix_extractor, bool for_flush) = 0;
+
+ // Returns an iterator that wraps a MemTableIterator and logically strips the
+ // user-defined timestamp of each key. This API is only used by flush when
+ // user-defined timestamps in MemTable only feature is enabled.
+ virtual InternalIterator* NewTimestampStrippingIterator(
+ const ReadOptions& read_options,
+ UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena,
+ const SliceTransform* prefix_extractor, size_t ts_sz) = 0;

  // Returns an iterator that yields the range tombstones of the memtable.
  // The caller must ensure that the underlying MemTable remains live
@@ -223,31 +173,23 @@
  // is constructed when a memtable becomes immutable. Setting the flag to false
  // will always yield correct result, but may incur performance penalty as it
  // always creates a new fragmented range tombstone list.
- FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
+ virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
  const ReadOptions& read_options, SequenceNumber read_seq,
- bool immutable_memtable);
+ bool immutable_memtable) = 0;

- Status VerifyEncodedEntry(Slice encoded,
- const ProtectionInfoKVOS64& kv_prot_info);
-
- // Add an entry into memtable that maps key to value at the
- // specified sequence number and with the specified type.
- // Typically value will be empty if type==kTypeDeletion.
- //
- // REQUIRES: if allow_concurrent = false, external synchronization to prevent
- // simultaneous operations on the same MemTable.
- //
- // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
- // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true.
- // The next attempt should try a larger value for `seq`.
- Status Add(SequenceNumber seq, ValueType type, const Slice& key,
- const Slice& value, const ProtectionInfoKVOS64* kv_prot_info,
- bool allow_concurrent = false,
- MemTablePostProcessInfo* post_process_info = nullptr,
- void** hint = nullptr);
+ // Returns an iterator that yields the range tombstones of the memtable and
+ // logically strips the user-defined timestamp of each key (including start
+ // key, and end key). This API is only used by flush when user-defined
+ // timestamps in MemTable only feature is enabled.
+ virtual FragmentedRangeTombstoneIterator*
+ NewTimestampStrippingRangeTombstoneIterator(const ReadOptions& read_options,
+ SequenceNumber read_seq,
+ size_t ts_sz) = 0;

  // Used to Get value associated with key or Get Merge Operands associated
  // with key.
+ // Keys are considered if they are no larger than the parameter `key` in
+ // the order defined by comparator and share the save user key with `key`.
  // If do_merge = true the default behavior which is Get value for key is
  // executed. Expected behavior is described right below.
  // If memtable contains a value for key, store it in *value and return true.
@@ -276,14 +218,13 @@
  // @param immutable_memtable Whether this memtable is immutable. Used
  // internally by NewRangeTombstoneIterator(). See comment above
  // NewRangeTombstoneIterator() for more detail.
- bool Get(const LookupKey& key, std::string* value,
- PinnableWideColumns* columns, std::string* timestamp, Status* s,
- MergeContext* merge_context,
- SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
- const ReadOptions& read_opts, bool immutable_memtable,
- ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
- bool do_merge = true);
-
+ virtual bool Get(const LookupKey& key, std::string* value,
+ PinnableWideColumns* columns, std::string* timestamp,
+ Status* s, MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ SequenceNumber* seq, const ReadOptions& read_opts,
+ bool immutable_memtable, ReadCallback* callback = nullptr,
+ bool* is_blob_index = nullptr, bool do_merge = true) = 0;
  bool Get(const LookupKey& key, std::string* value,
  PinnableWideColumns* columns, std::string* timestamp, Status* s,
  MergeContext* merge_context,
@@ -300,8 +241,351 @@
  // @param immutable_memtable Whether this memtable is immutable. Used
  // internally by NewRangeTombstoneIterator(). See comment above
  // NewRangeTombstoneIterator() for more detail.
+ virtual void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+ ReadCallback* callback, bool immutable_memtable) = 0;
+
+ // Get total number of entries in the mem table.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ virtual uint64_t NumEntries() const = 0;
+
+ // Get total number of point deletes in the mem table.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ virtual uint64_t NumDeletion() const = 0;
+
+ // Get total number of range deletions in the mem table.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ virtual uint64_t NumRangeDeletion() const = 0;
+
+ virtual uint64_t GetDataSize() const = 0;
+
+ // Returns the sequence number of the first element that was inserted
+ // into the memtable.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ virtual SequenceNumber GetFirstSequenceNumber() = 0;
+
+ // Returns if there is no entry inserted to the mem table.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ virtual bool IsEmpty() const = 0;
+
+ // Returns the sequence number that is guaranteed to be smaller than or equal
+ // to the sequence number of any key that could be inserted into this
+ // memtable. It can then be assumed that any write with a larger(or equal)
+ // sequence number will be present in this memtable or a later memtable.
+ //
+ // If the earliest sequence number could not be determined,
+ // kMaxSequenceNumber will be returned.
+ virtual SequenceNumber GetEarliestSequenceNumber() = 0;
+
+ virtual uint64_t GetMinLogContainingPrepSection() = 0;
+
+ // Notify the underlying storage that no more items will be added.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ // After MarkImmutable() is called, you should not attempt to
+ // write anything to this MemTable(). (Ie. do not call Add() or Update()).
+ virtual void MarkImmutable() = 0;
+
+ // Notify the underlying storage that all data it contained has been
+ // persisted.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ virtual void MarkFlushed() = 0;
+
+ struct MemTableStats {
+ uint64_t size;
+ uint64_t count;
+ };
+ virtual MemTableStats ApproximateStats(const Slice& start_ikey,
+ const Slice& end_ikey) = 0;
+
+ virtual const InternalKeyComparator& GetInternalKeyComparator() const = 0;
+
+ virtual uint64_t ApproximateOldestKeyTime() const = 0;
+
+ // Returns whether a fragmented range tombstone list is already constructed
+ // for this memtable. It should be constructed right before a memtable is
+ // added to an immutable memtable list. Note that if a memtable does not have
+ // any range tombstone, then no range tombstone list will ever be constructed
+ // and true is returned in that case.
+ virtual bool IsFragmentedRangeTombstonesConstructed() const = 0;
+
+ // Get the newest user-defined timestamp contained in this MemTable. Check
+ // `newest_udt_` for what newer means. This method should only be invoked for
+ // an MemTable that has enabled user-defined timestamp feature and set
+ // `persist_user_defined_timestamps` to false. The tracked newest UDT will be
+ // used by flush job in the background to help check the MemTable's
+ // eligibility for Flush.
+ virtual const Slice& GetNewestUDT() const = 0;
+
+ // Increase reference count.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ void Ref() { ++refs_; }
+
+ // Drop reference count.
+ // If the refcount goes to zero return this memtable, otherwise return null.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ ReadOnlyMemTable* Unref() {
+ --refs_;
+ assert(refs_ >= 0);
+ if (refs_ <= 0) {
+ return this;
+ }
+ return nullptr;
+ }
+
+ // Returns the edits area that is needed for flushing the memtable
+ VersionEdit* GetEdits() { return &edit_; }
+
+ // Returns the next active logfile number when this memtable is about to
+ // be flushed to storage
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ uint64_t GetNextLogNumber() const { return mem_next_logfile_number_; }
+
+ // Sets the next active logfile number when this memtable is about to
+ // be flushed to storage
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
+
+ // REQUIRES: db_mutex held.
+ void SetID(uint64_t id) { id_ = id; }
+
+ uint64_t GetID() const { return id_; }
+
+ void SetFlushCompleted(bool completed) { flush_completed_ = completed; }
+
+ uint64_t GetFileNumber() const { return file_number_; }
+
+ void SetFileNumber(uint64_t file_num) { file_number_ = file_num; }
+
+ void SetFlushInProgress(bool in_progress) {
+ flush_in_progress_ = in_progress;
+ }
+
+ void SetFlushJobInfo(std::unique_ptr<FlushJobInfo>&& info) {
+ flush_job_info_ = std::move(info);
+ }
+
+ std::unique_ptr<FlushJobInfo> ReleaseFlushJobInfo() {
+ return std::move(flush_job_info_);
+ }
+
+ static void HandleTypeValue(
+ const Slice& lookup_user_key, const Slice& value, bool value_pinned,
+ bool do_merge, bool merge_in_progress, MergeContext* merge_context,
+ const MergeOperator* merge_operator, SystemClock* clock,
+ Statistics* statistics, Logger* info_log, Status* s,
+ std::string* out_value, PinnableWideColumns* out_columns,
+ bool* is_blob_index) {
+ *s = Status::OK();
+
+ if (!do_merge) {
+ // Preserve the value with the goal of returning it as part of
+ // raw merge operands to the user
+ // TODO(yanqin) update MergeContext so that timestamps information
+ // can also be retained.
+ merge_context->PushOperand(value, value_pinned);
+ } else if (merge_in_progress) {
+ assert(do_merge);
+ // `op_failure_scope` (an output parameter) is not provided (set to
+ // nullptr) since a failure must be propagated regardless of its
+ // value.
+ if (out_value || out_columns) {
+ *s = MergeHelper::TimedFullMerge(
+ merge_operator, lookup_user_key, MergeHelper::kPlainBaseValue,
+ value, merge_context->GetOperands(), info_log, statistics, clock,
+ /* update_num_ops_stats */ true,
+ /* op_failure_scope */ nullptr, out_value, out_columns);
+ }
+ } else if (out_value) {
+ out_value->assign(value.data(), value.size());
+ } else if (out_columns) {
+ out_columns->SetPlainValue(value);
+ }
+
+ if (is_blob_index) {
+ *is_blob_index = false;
+ }
+ }
+
+ static void HandleTypeDeletion(
+ const Slice& lookup_user_key, bool merge_in_progress,
+ MergeContext* merge_context, const MergeOperator* merge_operator,
+ SystemClock* clock, Statistics* statistics, Logger* logger, Status* s,
+ std::string* out_value, PinnableWideColumns* out_columns) {
+ if (merge_in_progress) {
+ if (out_value || out_columns) {
+ // `op_failure_scope` (an output parameter) is not provided (set to
+ // nullptr) since a failure must be propagated regardless of its
+ // value.
+ *s = MergeHelper::TimedFullMerge(
+ merge_operator, lookup_user_key, MergeHelper::kNoBaseValue,
+ merge_context->GetOperands(), logger, statistics, clock,
+ /* update_num_ops_stats */ true,
+ /* op_failure_scope */ nullptr, out_value, out_columns);
+ } else {
+ // We have found a final value (a base deletion) and have newer
+ // merge operands that we do not intend to merge. Nothing remains
+ // to be done so assign status to OK.
+ *s = Status::OK();
+ }
+ } else {
+ *s = Status::NotFound();
+ }
+ }
+
+ protected:
+ friend class MemTableList;
+
+ int refs_{0};
+
+ // These are used to manage memtable flushes to storage
+ bool flush_in_progress_{false}; // started the flush
+ bool flush_completed_{false}; // finished the flush
+ uint64_t file_number_{0};
+
+ // The updates to be applied to the transaction log when this
+ // memtable is flushed to storage.
+ VersionEdit edit_;
+
+ // The log files earlier than this number can be deleted.
+ uint64_t mem_next_logfile_number_{0};
+
+ // Memtable id to track flush.
+ uint64_t id_ = 0;
+
+ // Sequence number of the atomic flush that is responsible for this memtable.
+ // The sequence number of atomic flush is a seq, such that no writes with
+ // sequence numbers greater than or equal to seq are flushed, while all
+ // writes with sequence number smaller than seq are flushed.
+ SequenceNumber atomic_flush_seqno_{kMaxSequenceNumber};
+
+ // Flush job info of the current memtable.
+ std::unique_ptr<FlushJobInfo> flush_job_info_;
+ };
+
+ class MemTable final : public ReadOnlyMemTable {
+ public:
+ struct KeyComparator final : public MemTableRep::KeyComparator {
+ const InternalKeyComparator comparator;
+ explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) {}
+ int operator()(const char* prefix_len_key1,
+ const char* prefix_len_key2) const override;
+ int operator()(const char* prefix_len_key,
+ const DecodedType& key) const override;
+ };
+
+ // earliest_seq should be the current SequenceNumber in the db such that any
+ // key inserted into this memtable will have an equal or larger seq number.
+ // (When a db is first created, the earliest sequence number will be 0).
+ // If the earliest sequence number is not known, kMaxSequenceNumber may be
+ // used, but this may prevent some transactions from succeeding until the
+ // first key is inserted into the memtable.
+ explicit MemTable(const InternalKeyComparator& comparator,
+ const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ WriteBufferManager* write_buffer_manager,
+ SequenceNumber earliest_seq, uint32_t column_family_id);
+ // No copying allowed
+ MemTable(const MemTable&) = delete;
+ MemTable& operator=(const MemTable&) = delete;
+
+ ~MemTable() override;
+
+ const char* Name() const override { return "MemTable"; }
+
+ size_t ApproximateMemoryUsage() override;
+
+ // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't
+ // require external synchronization. The value may be less accurate though
+ size_t ApproximateMemoryUsageFast() const {
+ return approximate_memory_usage_.load(std::memory_order_relaxed);
+ }
+
+ size_t MemoryAllocatedBytes() const override {
+ return table_->ApproximateMemoryUsage() +
+ range_del_table_->ApproximateMemoryUsage() +
+ arena_.MemoryAllocatedBytes();
+ }
+
+ void UniqueRandomSample(const uint64_t& target_sample_size,
+ std::unordered_set<const char*>* entries) override {
+ // TODO(bjlemaire): at the moment, only supported by skiplistrep.
+ // Extend it to all other memtable representations.
+ table_->UniqueRandomSample(NumEntries(), target_sample_size, entries);
+ }
+
+ // This method heuristically determines if the memtable should continue to
+ // host more data.
+ bool ShouldScheduleFlush() const {
+ return flush_state_.load(std::memory_order_relaxed) == FLUSH_REQUESTED;
+ }
+
+ // Returns true if a flush should be scheduled and the caller should
+ // be the one to schedule it
+ bool MarkFlushScheduled() {
+ auto before = FLUSH_REQUESTED;
+ return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED,
+ std::memory_order_relaxed,
+ std::memory_order_relaxed);
+ }
+
+ InternalIterator* NewIterator(
+ const ReadOptions& read_options,
+ UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena,
+ const SliceTransform* prefix_extractor, bool for_flush) override;
+
+ InternalIterator* NewTimestampStrippingIterator(
+ const ReadOptions& read_options,
+ UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena,
+ const SliceTransform* prefix_extractor, size_t ts_sz) override;
+
+ FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
+ const ReadOptions& read_options, SequenceNumber read_seq,
+ bool immutable_memtable) override;
+
+ FragmentedRangeTombstoneIterator* NewTimestampStrippingRangeTombstoneIterator(
+ const ReadOptions& read_options, SequenceNumber read_seq,
+ size_t ts_sz) override;
+
+ Status VerifyEncodedEntry(Slice encoded,
+ const ProtectionInfoKVOS64& kv_prot_info);
+
+ // Add an entry into memtable that maps key to value at the
+ // specified sequence number and with the specified type.
+ // Typically, value will be empty if type==kTypeDeletion.
+ //
+ // REQUIRES: if allow_concurrent = false, external synchronization to prevent
+ // simultaneous operations on the same MemTable.
+ //
+ // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
+ // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true.
+ // The next attempt should try a larger value for `seq`.
+ Status Add(SequenceNumber seq, ValueType type, const Slice& key,
+ const Slice& value, const ProtectionInfoKVOS64* kv_prot_info,
+ bool allow_concurrent = false,
+ MemTablePostProcessInfo* post_process_info = nullptr,
+ void** hint = nullptr);
+
+ using ReadOnlyMemTable::Get;
+ bool Get(const LookupKey& key, std::string* value,
+ PinnableWideColumns* columns, std::string* timestamp, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
+ const ReadOptions& read_opts, bool immutable_memtable,
+ ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
+ bool do_merge = true) override;
+
  void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
- ReadCallback* callback, bool immutable_memtable);
+ ReadCallback* callback, bool immutable_memtable) override;

  // If `key` exists in current memtable with type value_type and the existing
  // value is at least as large as the new value, updates it in-place. Otherwise
@@ -357,28 +641,19 @@ class MemTable {
  UpdateFlushState();
  }

- // Get total number of entries in the mem table.
- // REQUIRES: external synchronization to prevent simultaneous
- // operations on the same MemTable (unless this Memtable is immutable).
- uint64_t num_entries() const {
+ uint64_t NumEntries() const override {
  return num_entries_.load(std::memory_order_relaxed);
  }

- // Get total number of deletes in the mem table.
- // REQUIRES: external synchronization to prevent simultaneous
- // operations on the same MemTable (unless this Memtable is immutable).
- uint64_t num_deletes() const {
+ uint64_t NumDeletion() const override {
  return num_deletes_.load(std::memory_order_relaxed);
  }

- // Get total number of range deletions in the mem table.
- // REQUIRES: external synchronization to prevent simultaneous
- // operations on the same MemTable (unless this Memtable is immutable).
- uint64_t num_range_deletes() const {
+ uint64_t NumRangeDeletion() const override {
  return num_range_deletes_.load(std::memory_order_relaxed);
  }

- uint64_t get_data_size() const {
+ uint64_t GetDataSize() const override {
  return data_size_.load(std::memory_order_relaxed);
  }

@@ -398,19 +673,9 @@ class MemTable {
  }
  }

- // Returns the edits area that is needed for flushing the memtable
- VersionEdit* GetEdits() { return &edit_; }
-
- // Returns if there is no entry inserted to the mem table.
- // REQUIRES: external synchronization to prevent simultaneous
- // operations on the same MemTable (unless this Memtable is immutable).
- bool IsEmpty() const { return first_seqno_ == 0; }
+ bool IsEmpty() const override { return first_seqno_ == 0; }

- // Returns the sequence number of the first element that was inserted
- // into the memtable.
- // REQUIRES: external synchronization to prevent simultaneous
- // operations on the same MemTable (unless this Memtable is immutable).
- SequenceNumber GetFirstSequenceNumber() {
+ SequenceNumber GetFirstSequenceNumber() override {
  return first_seqno_.load(std::memory_order_relaxed);
  }

@@ -422,14 +687,8 @@ class MemTable {
  return first_seqno_.store(first_seqno, std::memory_order_relaxed);
  }

- // Returns the sequence number that is guaranteed to be smaller than or equal
- // to the sequence number of any key that could be inserted into this
- // memtable. It can then be assumed that any write with a larger(or equal)
- // sequence number will be present in this memtable or a later memtable.
- //
- // If the earliest sequence number could not be determined,
- // kMaxSequenceNumber will be returned.
- SequenceNumber GetEarliestSequenceNumber() {
+ SequenceNumber GetEarliestSequenceNumber() override {
+ // With file ingestion and empty memtable, this seqno needs to be fixed.
  return earliest_seqno_.load(std::memory_order_relaxed);
  }

@@ -448,40 +707,18 @@

  void SetCreationSeq(SequenceNumber sn) { creation_seq_ = sn; }

- // Returns the next active logfile number when this memtable is about to
- // be flushed to storage
- // REQUIRES: external synchronization to prevent simultaneous
- // operations on the same MemTable.
- uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }
-
- // Sets the next active logfile number when this memtable is about to
- // be flushed to storage
- // REQUIRES: external synchronization to prevent simultaneous
- // operations on the same MemTable.
- void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
-
- // if this memtable contains data from a committed
- // two phase transaction we must take note of the
- // log which contains that data so we can know
- // when to relese that log
+ // If this memtable contains data from a committed two phase transaction we
+ // must take note of the log which contains that data so we can know when
+ // to release that log.
  void RefLogContainingPrepSection(uint64_t log);
- uint64_t GetMinLogContainingPrepSection();
+ uint64_t GetMinLogContainingPrepSection() override;

- // Notify the underlying storage that no more items will be added.
- // REQUIRES: external synchronization to prevent simultaneous
- // operations on the same MemTable.
- // After MarkImmutable() is called, you should not attempt to
- // write anything to this MemTable(). (Ie. do not call Add() or Update()).
- void MarkImmutable() {
+ void MarkImmutable() override {
  table_->MarkReadOnly();
  mem_tracker_.DoneAllocating();
  }

- // Notify the underlying storage that all data it contained has been
- // persisted.
- // REQUIRES: external synchronization to prevent simultaneous
- // operations on the same MemTable.
- void MarkFlushed() { table_->MarkFlushed(); }
+ void MarkFlushed() override { table_->MarkFlushed(); }

  // return true if the current MemTableRep supports merge operator.
  bool IsMergeOperatorSupported() const {
@@ -494,18 +731,13 @@ class MemTable {
  return table_->IsSnapshotSupported() && !moptions_.inplace_update_support;
  }

- struct MemTableStats {
- uint64_t size;
- uint64_t count;
- };
-
  MemTableStats ApproximateStats(const Slice& start_ikey,
- const Slice& end_ikey);
+ const Slice& end_ikey) override;

  // Get the lock associated for the key
  port::RWMutex* GetLock(const Slice& key);

- const InternalKeyComparator& GetInternalKeyComparator() const {
+ const InternalKeyComparator& GetInternalKeyComparator() const override {
  return comparator_.comparator;
  }

@@ -513,33 +745,10 @@ class MemTable {
  return &moptions_;
  }

- uint64_t ApproximateOldestKeyTime() const {
+ uint64_t ApproximateOldestKeyTime() const override {
  return oldest_key_time_.load(std::memory_order_relaxed);
  }

- // REQUIRES: db_mutex held.
- void SetID(uint64_t id) { id_ = id; }
-
- uint64_t GetID() const { return id_; }
-
- void SetFlushCompleted(bool completed) { flush_completed_ = completed; }
-
- uint64_t GetFileNumber() const { return file_number_; }
-
- void SetFileNumber(uint64_t file_num) { file_number_ = file_num; }
-
- void SetFlushInProgress(bool in_progress) {
- flush_in_progress_ = in_progress;
- }
-
- void SetFlushJobInfo(std::unique_ptr<FlushJobInfo>&& info) {
- flush_job_info_ = std::move(info);
- }
-
- std::unique_ptr<FlushJobInfo> ReleaseFlushJobInfo() {
- return std::move(flush_job_info_);
- }
-
  // Returns a heuristic flush decision
  bool ShouldFlushNow();

@@ -550,23 +759,12 @@ class MemTable {
  // SwitchMemtable() may fail.
  void ConstructFragmentedRangeTombstones();

- // Returns whether a fragmented range tombstone list is already constructed
- // for this memtable. It should be constructed right before a memtable is
- // added to an immutable memtable list. Note that if a memtable does not have
- // any range tombstone, then no range tombstone list will ever be constructed
- // and true is returned in that case.
- bool IsFragmentedRangeTombstonesConstructed() const {
+ bool IsFragmentedRangeTombstonesConstructed() const override {
  return fragmented_range_tombstone_list_.get() != nullptr ||
  is_range_del_table_empty_;
  }

- // Get the newest user-defined timestamp contained in this MemTable. Check
- // `newest_udt_` for what newer means. This method should only be invoked for
- // an MemTable that has enabled user-defined timestamp feature and set
- // `persist_user_defined_timestamps` to false. The tracked newest UDT will be
- // used by flush job in the background to help check the MemTable's
- // eligibility for Flush.
- const Slice& GetNewestUDT() const;
+ const Slice& GetNewestUDT() const override;

  // Returns Corruption status if verification fails.
  static Status VerifyEntryChecksum(const char* entry,
@@ -582,7 +780,6 @@ class MemTable {

  KeyComparator comparator_;
  const ImmutableMemTableOptions moptions_;
- int refs_;
  const size_t kArenaBlockSize;
  AllocTracker mem_tracker_;
  ConcurrentArena arena_;
@@ -599,15 +796,6 @@ class MemTable {
  // Dynamically changeable memtable option
  std::atomic<size_t> write_buffer_size_;

- // These are used to manage memtable flushes to storage
- bool flush_in_progress_; // started the flush
- bool flush_completed_; // finished the flush
- uint64_t file_number_; // filled up after flush is complete
-
- // The updates to be applied to the transaction log when this
- // memtable is flushed to storage.
- VersionEdit edit_;
-
  // The sequence number of the kv that was inserted first
  std::atomic<SequenceNumber> first_seqno_;

@@ -617,9 +805,6 @@ class MemTable {

  SequenceNumber creation_seq_;

- // The log files earlier than this number can be deleted.
- uint64_t mem_next_logfile_number_;
-
  // the earliest log containing a prepared section
  // which has been inserted into this memtable.
  std::atomic<uint64_t> min_prep_log_referenced_;
@@ -643,15 +828,6 @@ class MemTable {
  // Timestamp of oldest key
  std::atomic<uint64_t> oldest_key_time_;

- // Memtable id to track flush.
- uint64_t id_ = 0;
-
- // Sequence number of the atomic flush that is responsible for this memtable.
- // The sequence number of atomic flush is a seq, such that no writes with
- // sequence numbers greater than or equal to seq are flushed, while all
- // writes with sequence number smaller than seq are flushed.
- SequenceNumber atomic_flush_seqno_;
-
  // keep track of memory usage in table_, arena_, and range_del_table_.
  // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow`
  std::atomic<uint64_t> approximate_memory_usage_;
@@ -660,9 +836,6 @@ class MemTable {
  // unlimited.
  uint32_t memtable_max_range_deletions_ = 0;

- // Flush job info of the current memtable.
- std::unique_ptr<FlushJobInfo> flush_job_info_;
-
  // Size in bytes for the user-defined timestamps.
  size_t ts_sz_;

@@ -704,6 +877,12 @@ class MemTable {
  std::unique_ptr<FragmentedRangeTombstoneList>
  fragmented_range_tombstone_list_;

+ // The fragmented range tombstone of this memtable with all keys' user-defined
+ // timestamps logically stripped. This is constructed and used by flush when
+ // user-defined timestamps in memtable only feature is enabled.
+ std::unique_ptr<FragmentedRangeTombstoneList>
+ timestamp_stripping_fragmented_range_tombstone_list_;
+
  // makes sure there is a single range tombstone writer to invalidate cache
  std::mutex range_del_mutex_;
  CoreLocalArray<std::shared_ptr<FragmentedRangeTombstoneListCache>>
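
The largest visible change in this diff is the db/memtable.h refactor that extracts a ReadOnlyMemTable base class from MemTable. The new header comments state the lifetime contract: the initial reference count is zero, the caller must call Ref() at least once, and the object may only be deleted once Unref() returns it. The following caller-side sketch is purely illustrative (it is not part of the package); the wrapper function MemTableLifetimeSketch and its arguments are hypothetical, and it assumes configured InternalKeyComparator, ImmutableOptions, MutableCFOptions and WriteBufferManager objects are already available from the surrounding DB code.

// Hedged sketch of the Ref()/Unref() protocol documented on ReadOnlyMemTable.
#include "db/memtable.h"

using namespace ROCKSDB_NAMESPACE;

void MemTableLifetimeSketch(const InternalKeyComparator& cmp,
                            const ImmutableOptions& ioptions,
                            const MutableCFOptions& mutable_cf_options,
                            WriteBufferManager* write_buffer_manager) {
  // The initial reference count is zero, so the owner takes the first
  // reference explicitly (constructor signature as declared in this diff).
  MemTable* mem = new MemTable(cmp, ioptions, mutable_cf_options,
                               write_buffer_manager,
                               /*earliest_seq=*/0, /*column_family_id=*/0);
  mem->Ref();

  // ... writes would go through MemTable::Add(), reads through Get(),
  // MultiGet(), or NewIterator(), under the synchronization rules the
  // header comments describe ...

  // Unref() returns the memtable only when the count drops to zero; only
  // then may it be deleted (here through the virtual base destructor).
  ReadOnlyMemTable* to_delete = mem->Unref();
  if (to_delete != nullptr) {
    delete to_delete;
  }
}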