@nxtedition/rocksdb 10.1.5 → 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (255):
  1. package/binding.cc +19 -11
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +16 -5
  3. package/deps/rocksdb/rocksdb/Makefile +38 -15
  4. package/deps/rocksdb/rocksdb/TARGETS +10 -0
  5. package/deps/rocksdb/rocksdb/cache/cache_test.cc +58 -0
  6. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +4 -4
  7. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +4 -2
  8. package/deps/rocksdb/rocksdb/db/builder.cc +2 -2
  9. package/deps/rocksdb/rocksdb/db/builder.h +1 -1
  10. package/deps/rocksdb/rocksdb/db/c.cc +205 -6
  11. package/deps/rocksdb/rocksdb/db/c_test.c +189 -1
  12. package/deps/rocksdb/rocksdb/db/column_family.cc +28 -0
  13. package/deps/rocksdb/rocksdb/db/column_family.h +17 -0
  14. package/deps/rocksdb/rocksdb/db/column_family_test.cc +234 -60
  15. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +8 -1
  16. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +11 -9
  17. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +4 -4
  18. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +2 -0
  19. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +1 -0
  20. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +22 -25
  21. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +2 -0
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +112 -0
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +72 -21
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +2 -0
  25. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +77 -0
  26. package/deps/rocksdb/rocksdb/db/convenience.cc +3 -0
  27. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +269 -112
  28. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +107 -43
  29. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +93 -24
  30. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +5 -5
  31. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +157 -68
  32. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +56 -15
  33. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +78 -105
  34. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +39 -9
  35. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +1 -0
  36. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +21 -14
  37. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +107 -63
  38. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +43 -2
  39. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +4 -0
  40. package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +6 -0
  41. package/deps/rocksdb/rocksdb/db/db_test.cc +10 -2
  42. package/deps/rocksdb/rocksdb/db/db_test2.cc +1 -1
  43. package/deps/rocksdb/rocksdb/db/db_test_util.cc +5 -0
  44. package/deps/rocksdb/rocksdb/db/db_test_util.h +7 -6
  45. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +92 -2
  46. package/deps/rocksdb/rocksdb/db/error_handler.cc +34 -39
  47. package/deps/rocksdb/rocksdb/db/error_handler.h +3 -4
  48. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +8 -4
  49. package/deps/rocksdb/rocksdb/db/event_helpers.cc +6 -3
  50. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +71 -15
  51. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +11 -0
  52. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +383 -4
  53. package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +88 -72
  54. package/deps/rocksdb/rocksdb/db/flush_job.cc +30 -3
  55. package/deps/rocksdb/rocksdb/db/flush_job.h +14 -0
  56. package/deps/rocksdb/rocksdb/db/internal_stats.cc +60 -1
  57. package/deps/rocksdb/rocksdb/db/internal_stats.h +20 -1
  58. package/deps/rocksdb/rocksdb/db/log_writer.cc +24 -0
  59. package/deps/rocksdb/rocksdb/db/log_writer.h +5 -0
  60. package/deps/rocksdb/rocksdb/db/memtable.cc +6 -4
  61. package/deps/rocksdb/rocksdb/db/memtable.h +10 -10
  62. package/deps/rocksdb/rocksdb/db/memtable_list.cc +4 -4
  63. package/deps/rocksdb/rocksdb/db/multi_cf_iterator_impl.h +10 -3
  64. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +8 -10
  65. package/deps/rocksdb/rocksdb/db/repair.cc +4 -3
  66. package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.cc +30 -0
  67. package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.h +9 -0
  68. package/deps/rocksdb/rocksdb/db/table_cache.cc +17 -2
  69. package/deps/rocksdb/rocksdb/db/table_cache.h +9 -1
  70. package/deps/rocksdb/rocksdb/db/table_properties_collector.h +9 -2
  71. package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +3 -1
  72. package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +3 -3
  73. package/deps/rocksdb/rocksdb/db/transaction_log_impl.h +7 -7
  74. package/deps/rocksdb/rocksdb/db/version_edit.cc +0 -1
  75. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +7 -6
  76. package/deps/rocksdb/rocksdb/db/version_set.cc +54 -31
  77. package/deps/rocksdb/rocksdb/db/version_set.h +14 -7
  78. package/deps/rocksdb/rocksdb/db/wal_manager.cc +37 -29
  79. package/deps/rocksdb/rocksdb/db/wal_manager.h +6 -5
  80. package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.cc +6 -0
  81. package/deps/rocksdb/rocksdb/db/write_batch.cc +54 -23
  82. package/deps/rocksdb/rocksdb/db/write_callback_test.cc +46 -5
  83. package/deps/rocksdb/rocksdb/db/write_thread.cc +53 -5
  84. package/deps/rocksdb/rocksdb/db/write_thread.h +36 -4
  85. package/deps/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt +1 -0
  86. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +5 -0
  87. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +57 -17
  88. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +11 -3
  89. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +8 -4
  90. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +10 -25
  91. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +25 -88
  92. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_filters.cc +93 -0
  93. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_filters.h +16 -0
  94. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +43 -0
  95. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +109 -21
  96. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +8 -0
  97. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +666 -205
  98. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +55 -10
  99. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +18 -16
  100. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +19 -0
  101. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +5 -0
  102. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +782 -494
  103. package/deps/rocksdb/rocksdb/env/composite_env_wrapper.h +21 -0
  104. package/deps/rocksdb/rocksdb/env/env.cc +6 -0
  105. package/deps/rocksdb/rocksdb/env/io_posix.cc +0 -1
  106. package/deps/rocksdb/rocksdb/file/file_util.cc +8 -2
  107. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +34 -19
  108. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +29 -32
  109. package/deps/rocksdb/rocksdb/file/writable_file_writer.h +41 -15
  110. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +4 -2
  111. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +63 -0
  112. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +16 -5
  113. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +5 -0
  114. package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +0 -16
  115. package/deps/rocksdb/rocksdb/include/rocksdb/iterator_base.h +16 -0
  116. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +21 -0
  117. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +76 -3
  118. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +17 -0
  119. package/deps/rocksdb/rocksdb/include/rocksdb/transaction_log.h +12 -6
  120. package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +31 -0
  121. package/deps/rocksdb/rocksdb/include/rocksdb/user_write_callback.h +29 -0
  122. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/checkpoint.h +4 -2
  123. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/customizable_util.h +0 -1
  124. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +17 -8
  125. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +2 -2
  126. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +46 -0
  127. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +7 -0
  128. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  129. package/deps/rocksdb/rocksdb/options/cf_options.cc +13 -2
  130. package/deps/rocksdb/rocksdb/options/cf_options.h +6 -2
  131. package/deps/rocksdb/rocksdb/options/db_options.cc +8 -0
  132. package/deps/rocksdb/rocksdb/options/db_options.h +9 -5
  133. package/deps/rocksdb/rocksdb/options/options.cc +3 -0
  134. package/deps/rocksdb/rocksdb/options/options_helper.cc +1 -0
  135. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +3 -1
  136. package/deps/rocksdb/rocksdb/port/jemalloc_helper.h +2 -2
  137. package/deps/rocksdb/rocksdb/port/stack_trace.cc +1 -0
  138. package/deps/rocksdb/rocksdb/port/win/port_win.cc +3 -2
  139. package/deps/rocksdb/rocksdb/src.mk +4 -0
  140. package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +1 -2
  141. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +4 -2
  142. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +15 -0
  143. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +102 -41
  144. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +15 -7
  145. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +1 -3
  146. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +5 -6
  147. package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +31 -0
  148. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +6 -0
  149. package/deps/rocksdb/rocksdb/table/block_based/cachable_entry.h +10 -5
  150. package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +11 -15
  151. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +17 -11
  152. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.h +5 -2
  153. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.cc +28 -21
  154. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.h +9 -11
  155. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc +16 -16
  156. package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +1 -2
  157. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +14 -9
  158. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.h +4 -1
  159. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +82 -41
  160. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +13 -14
  161. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +18 -22
  162. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +51 -13
  163. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.h +2 -0
  164. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +3 -11
  165. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +2 -3
  166. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +9 -10
  167. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +3 -2
  168. package/deps/rocksdb/rocksdb/table/format.cc +1 -2
  169. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +18 -13
  170. package/deps/rocksdb/rocksdb/table/merging_iterator.h +5 -3
  171. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +2 -2
  172. package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +1 -1
  173. package/deps/rocksdb/rocksdb/table/sst_file_writer_collectors.h +3 -1
  174. package/deps/rocksdb/rocksdb/table/table_builder.h +8 -7
  175. package/deps/rocksdb/rocksdb/table/table_reader.h +9 -0
  176. package/deps/rocksdb/rocksdb/test_util/testutil.cc +1 -0
  177. package/deps/rocksdb/rocksdb/test_util/testutil.h +6 -0
  178. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +19 -0
  179. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +434 -110
  180. package/deps/rocksdb/rocksdb/tools/ldb_cmd_impl.h +3 -1
  181. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +3 -0
  182. package/deps/rocksdb/rocksdb/util/aligned_storage.h +24 -0
  183. package/deps/rocksdb/rocksdb/util/filter_bench.cc +1 -1
  184. package/deps/rocksdb/rocksdb/util/random.cc +2 -1
  185. package/deps/rocksdb/rocksdb/util/stderr_logger.h +1 -1
  186. package/deps/rocksdb/rocksdb/util/udt_util.cc +33 -0
  187. package/deps/rocksdb/rocksdb/util/udt_util.h +7 -0
  188. package/deps/rocksdb/rocksdb/util/udt_util_test.cc +33 -0
  189. package/deps/rocksdb/rocksdb/util/write_batch_util.h +5 -0
  190. package/deps/rocksdb/rocksdb/util/xxhash.h +10 -3
  191. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +13 -13
  192. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +104 -48
  193. package/deps/rocksdb/rocksdb/utilities/debug.cc +16 -4
  194. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +647 -235
  195. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +274 -157
  196. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.cc +144 -0
  197. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.h +45 -0
  198. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector_test.cc +139 -0
  199. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc +12 -0
  200. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc +3 -0
  201. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +105 -6
  202. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +64 -8
  203. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +5 -0
  204. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +43 -5
  205. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +5 -0
  206. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +154 -6
  207. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +1 -1
  208. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +158 -2
  209. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +16 -11
  210. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +4 -4
  211. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +9 -8
  212. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.cc +2 -1
  213. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +43 -7
  214. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +2 -0
  215. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +1 -1
  216. package/package.json +1 -1
  217. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  218. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
  219. package/.tap/test-results/node_modules/abstract-level/test/chained-batch-test.js.tap +0 -0
  220. package/.tap/test-results/node_modules/abstract-level/test/get-test.js.tap +0 -0
  221. package/.tap/test-results/test/abstract-level-test.js.tap +0 -1077
  222. package/.tap/test-results/test/batch-test.js.tap +0 -12
  223. package/.tap/test-results/test/chained-batch-gc-test.js.tap +0 -11
  224. package/.tap/test-results/test/cleanup-hanging-iterators-test.js.tap +0 -135
  225. package/.tap/test-results/test/clear-gc-test.js.tap +0 -13
  226. package/.tap/test-results/test/column-test.js.tap +0 -55
  227. package/.tap/test-results/test/common.js.tap +0 -0
  228. package/.tap/test-results/test/compression-test.js.tap +0 -30
  229. package/.tap/test-results/test/db-identity.js.tap +0 -12
  230. package/.tap/test-results/test/electron.js.tap +0 -0
  231. package/.tap/test-results/test/env-cleanup-hook-test.js.tap +0 -40
  232. package/.tap/test-results/test/env-cleanup-hook.js.tap +0 -0
  233. package/.tap/test-results/test/gc.js.tap +0 -0
  234. package/.tap/test-results/test/getproperty-test.js.tap +0 -29
  235. package/.tap/test-results/test/iterator-gc-test.js.tap +0 -15
  236. package/.tap/test-results/test/iterator-hwm-test.js.tap +0 -131
  237. package/.tap/test-results/test/iterator-recursion-test.js.tap +0 -12
  238. package/.tap/test-results/test/iterator-starvation-test.js.tap +0 -73
  239. package/.tap/test-results/test/iterator-test.js.tap +0 -6
  240. package/.tap/test-results/test/leak-tester-batch.js.tap +0 -0
  241. package/.tap/test-results/test/leak-tester-iterator.js.tap +0 -0
  242. package/.tap/test-results/test/leak-tester.js.tap +0 -0
  243. package/.tap/test-results/test/lock-test.js.tap +0 -18
  244. package/.tap/test-results/test/lock.js.tap +0 -0
  245. package/.tap/test-results/test/make.js.tap +0 -0
  246. package/.tap/test-results/test/max-rev-merge.js.tap +0 -0
  247. package/.tap/test-results/test/merge-operator-test.js.tap +0 -12
  248. package/.tap/test-results/test/mkdir-test.js.tap +0 -15
  249. package/.tap/test-results/test/segfault-test.js.tap +0 -76
  250. package/.tap/test-results/test/stack-blower.js.tap +0 -0
  251. package/deps/rocksdb/rocksdb/README.md +0 -29
  252. package/deps/rocksdb/rocksdb/microbench/README.md +0 -60
  253. package/deps/rocksdb/rocksdb/plugin/README.md +0 -43
  254. package/deps/rocksdb/rocksdb/port/README +0 -10
  255. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/README +0 -13
@@ -9,6 +9,7 @@
9
9
  #pragma once
10
10
 
11
11
  #include <atomic>
12
+ #include <cstdint>
12
13
  #include <deque>
13
14
  #include <functional>
14
15
  #include <limits>
@@ -57,6 +58,7 @@
57
58
  #include "rocksdb/status.h"
58
59
  #include "rocksdb/trace_reader_writer.h"
59
60
  #include "rocksdb/transaction_log.h"
61
+ #include "rocksdb/user_write_callback.h"
60
62
  #include "rocksdb/utilities/replayer.h"
61
63
  #include "rocksdb/write_buffer_manager.h"
62
64
  #include "table/merging_iterator.h"
@@ -231,6 +233,10 @@ class DBImpl : public DB {
231
233
  using DB::Write;
232
234
  Status Write(const WriteOptions& options, WriteBatch* updates) override;
233
235
 
236
+ using DB::WriteWithCallback;
237
+ Status WriteWithCallback(const WriteOptions& options, WriteBatch* updates,
238
+ UserWriteCallback* user_write_cb) override;
239
+
234
240
  using DB::Get;
235
241
  Status Get(const ReadOptions& _read_options,
236
242
  ColumnFamilyHandle* column_family, const Slice& key,
@@ -501,8 +507,16 @@ class DBImpl : public DB {
501
507
  // All the returned filenames start with "/"
502
508
  Status GetLiveFiles(std::vector<std::string>&, uint64_t* manifest_file_size,
503
509
  bool flush_memtable = true) override;
504
- Status GetSortedWalFiles(VectorLogPtr& files) override;
505
- Status GetCurrentWalFile(std::unique_ptr<LogFile>* current_log_file) override;
510
+ Status GetSortedWalFiles(VectorWalPtr& files) override;
511
+ Status GetSortedWalFilesImpl(VectorWalPtr& files, bool need_seqnos);
512
+
513
+ // Get the known flushed sizes of WALs that might still be written to
514
+ // or have pending sync.
515
+ // NOTE: unlike alive_log_files_, this function includes WALs that might
516
+ // be obsolete (but not obsolete to a pending Checkpoint) and not yet fully
517
+ // synced.
518
+ Status GetOpenWalSizes(std::map<uint64_t, uint64_t>& number_to_size);
519
+ Status GetCurrentWalFile(std::unique_ptr<WalFile>* current_log_file) override;
506
520
  Status GetCreationTimeOfOldestFile(uint64_t* creation_time) override;
507
521
 
508
522
  Status GetUpdatesSince(
@@ -688,7 +702,8 @@ class DBImpl : public DB {
688
702
  // thread to determine whether it is safe to perform the write.
689
703
  virtual Status WriteWithCallback(const WriteOptions& write_options,
690
704
  WriteBatch* my_batch,
691
- WriteCallback* callback);
705
+ WriteCallback* callback,
706
+ UserWriteCallback* user_write_cb = nullptr);
692
707
 
693
708
  // Returns the sequence number that is guaranteed to be smaller than or equal
694
709
  // to the sequence number of any key that could be inserted into the current
@@ -1447,6 +1462,9 @@ class DBImpl : public DB {
1447
1462
  Status RenameTempFileToOptionsFile(const std::string& file_name);
1448
1463
  Status DeleteObsoleteOptionsFiles();
1449
1464
 
1465
+ void NotifyOnManualFlushScheduled(autovector<ColumnFamilyData*> cfds,
1466
+ FlushReason flush_reason);
1467
+
1450
1468
  void NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
1451
1469
  const MutableCFOptions& mutable_cf_options,
1452
1470
  int job_id, FlushReason flush_reason);
@@ -1497,6 +1515,7 @@ class DBImpl : public DB {
1497
1515
  // batch that does not have duplicate keys.
1498
1516
  Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
1499
1517
  WriteCallback* callback = nullptr,
1518
+ UserWriteCallback* user_write_cb = nullptr,
1500
1519
  uint64_t* log_used = nullptr, uint64_t log_ref = 0,
1501
1520
  bool disable_memtable = false, uint64_t* seq_used = nullptr,
1502
1521
  size_t batch_cnt = 0,
@@ -1505,6 +1524,7 @@ class DBImpl : public DB {
1505
1524
 
1506
1525
  Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates,
1507
1526
  WriteCallback* callback = nullptr,
1527
+ UserWriteCallback* user_write_cb = nullptr,
1508
1528
  uint64_t* log_used = nullptr, uint64_t log_ref = 0,
1509
1529
  bool disable_memtable = false,
1510
1530
  uint64_t* seq_used = nullptr);
@@ -1531,7 +1551,8 @@ class DBImpl : public DB {
1531
1551
  // marks start of a new sub-batch.
1532
1552
  Status WriteImplWALOnly(
1533
1553
  WriteThread* write_thread, const WriteOptions& options,
1534
- WriteBatch* updates, WriteCallback* callback, uint64_t* log_used,
1554
+ WriteBatch* updates, WriteCallback* callback,
1555
+ UserWriteCallback* user_write_cb, uint64_t* log_used,
1535
1556
  const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
1536
1557
  PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
1537
1558
  const PublishLastSeq publish_last_seq, const bool disable_memtable);
@@ -1703,8 +1724,11 @@ class DBImpl : public DB {
1703
1724
  return w;
1704
1725
  }
1705
1726
  Status ClearWriter() {
1706
- // TODO: plumb Env::IOActivity, Env::IOPriority
1707
- Status s = writer->WriteBuffer(WriteOptions());
1727
+ Status s;
1728
+ if (writer->file()) {
1729
+ // TODO: plumb Env::IOActivity, Env::IOPriority
1730
+ s = writer->WriteBuffer(WriteOptions());
1731
+ }
1708
1732
  delete writer;
1709
1733
  writer = nullptr;
1710
1734
  return s;
@@ -1719,10 +1743,16 @@ class DBImpl : public DB {
1719
1743
 
1720
1744
  void PrepareForSync() {
1721
1745
  assert(!getting_synced);
1722
- // Size is expected to be monotonically increasing.
1723
- assert(writer->file()->GetFlushedSize() >= pre_sync_size);
1746
+ // Ensure the head of logs_ is marked as getting_synced if any is.
1724
1747
  getting_synced = true;
1725
- pre_sync_size = writer->file()->GetFlushedSize();
1748
+ // If last sync failed on a later WAL, this could be a fully synced
1749
+ // and closed WAL that just needs to be recorded as synced in the
1750
+ // manifest.
1751
+ if (writer->file()) {
1752
+ // Size is expected to be monotonically increasing.
1753
+ assert(writer->file()->GetFlushedSize() >= pre_sync_size);
1754
+ pre_sync_size = writer->file()->GetFlushedSize();
1755
+ }
1726
1756
  }
1727
1757
 
1728
1758
  void FinishSync() {
@@ -1920,7 +1950,7 @@ class DBImpl : public DB {
1920
1950
  void ReleaseFileNumberFromPendingOutputs(
1921
1951
  std::unique_ptr<std::list<uint64_t>::iterator>& v);
1922
1952
 
1923
- IOStatus SyncClosedLogs(const WriteOptions& write_options,
1953
+ IOStatus SyncClosedWals(const WriteOptions& write_options,
1924
1954
  JobContext* job_context, VersionEdit* synced_wals,
1925
1955
  bool error_recovery_in_prog);
1926
1956
 
@@ -2179,7 +2209,8 @@ class DBImpl : public DB {
2179
2209
  void GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
2180
2210
  FlushReason flush_reason, FlushRequest* req);
2181
2211
 
2182
- void SchedulePendingFlush(const FlushRequest& req);
2212
+ // Returns true if `req` is successfully enqueued.
2213
+ bool SchedulePendingFlush(const FlushRequest& req);
2183
2214
 
2184
2215
  void SchedulePendingCompaction(ColumnFamilyData* cfd);
2185
2216
  void SchedulePendingPurge(std::string fname, std::string dir_to_sync,
@@ -2255,6 +2286,11 @@ class DBImpl : public DB {
2255
2286
  ColumnFamilyData* PickCompactionFromQueue(
2256
2287
  std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer);
2257
2288
 
2289
+ IOStatus SyncWalImpl(bool include_current_wal,
2290
+ const WriteOptions& write_options,
2291
+ JobContext* job_context, VersionEdit* synced_wals,
2292
+ bool error_recovery_in_prog);
2293
+
2258
2294
  // helper function to call after some of the logs_ were synced
2259
2295
  void MarkLogsSynced(uint64_t up_to, bool synced_dir, VersionEdit* edit);
2260
2296
  Status ApplyWALToManifest(const ReadOptions& read_options,
@@ -2312,6 +2348,9 @@ class DBImpl : public DB {
2312
2348
  bool HaveManualCompaction(ColumnFamilyData* cfd);
2313
2349
  bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1);
2314
2350
  void UpdateDeletionCompactionStats(const std::unique_ptr<Compaction>& c);
2351
+
2352
+ // May open and read table files for table property.
2353
+ // Should not be called while holding mutex_.
2315
2354
  void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c,
2316
2355
  const Status& st,
2317
2356
  const CompactionJobStats& compaction_job_stats,
@@ -2527,8 +2566,10 @@ class DBImpl : public DB {
2527
2566
 
2528
2567
  bool persistent_stats_cfd_exists_ = true;
2529
2568
 
2530
- // alive_log_files_ is protected by mutex_ and log_write_mutex_ with details
2531
- // as follows:
2569
+ // The current WAL file and those that have not been found obsolete from
2570
+ // memtable flushes. A WAL not on this list might still be pending writer
2571
+ // flush and/or sync and close and might still be in logs_. alive_log_files_
2572
+ // is protected by mutex_ and log_write_mutex_ with details as follows:
2532
2573
  // 1. read by FindObsoleteFiles() which can be called in either application
2533
2574
  // thread or RocksDB bg threads, both mutex_ and log_write_mutex_ are
2534
2575
  // held.
@@ -2579,7 +2620,7 @@ class DBImpl : public DB {
2579
2620
  // 8. read by MarkLogsNotSynced() and MarkLogsSynced() are protected by
2580
2621
  // log_write_mutex_.
2581
2622
  // 9. erase() by MarkLogsSynced() protected by log_write_mutex_.
2582
- // 10. read by SyncClosedLogs() protected by only log_write_mutex_. This can
2623
+ // 10. read by SyncClosedWals() protected by only log_write_mutex_. This can
2583
2624
  // happen in bg flush threads after DB::Open() returns success to
2584
2625
  // applications.
2585
2626
  // 11. reads, e.g. front(), iteration, and back() called by PreprocessWrite()
@@ -2592,7 +2633,7 @@ class DBImpl : public DB {
2592
2633
  // 13. emplace_back() by SwitchMemtable() hold both mutex_ and
2593
2634
  // log_write_mutex_. This happens in the write group leader. Can conflict
2594
2635
  // with bg threads calling FindObsoleteFiles(), MarkLogsSynced(),
2595
- // SyncClosedLogs(), etc. as well as application threads calling
2636
+ // SyncClosedWals(), etc. as well as application threads calling
2596
2637
  // FlushWAL(), SyncWAL(), LockWAL(). This is fine because all parties
2597
2638
  // require at least log_write_mutex_.
2598
2639
  // 14. iteration called in WriteToWAL(write_group) protected by
@@ -87,6 +87,9 @@ bool DBImpl::ShouldRescheduleFlushRequestToRetainUDT(
87
87
  mutex_.AssertHeld();
88
88
  assert(flush_req.cfd_to_max_mem_id_to_persist.size() == 1);
89
89
  ColumnFamilyData* cfd = flush_req.cfd_to_max_mem_id_to_persist.begin()->first;
90
+ if (cfd->GetAndClearFlushSkipReschedule()) {
91
+ return false;
92
+ }
90
93
  uint64_t max_memtable_id =
91
94
  flush_req.cfd_to_max_mem_id_to_persist.begin()->second;
92
95
  if (cfd->IsDropped() ||
@@ -98,15 +101,20 @@ bool DBImpl::ShouldRescheduleFlushRequestToRetainUDT(
98
101
  // alleviated if we continue with the flush instead of postponing it.
99
102
  const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions();
100
103
 
101
- // Taking the status of the active Memtable into consideration so that we are
102
- // not just checking if DB is currently already in write stall mode.
103
- int mem_to_flush = cfd->mem()->ApproximateMemoryUsageFast() >=
104
- cfd->mem()->write_buffer_size() / 2
105
- ? 1
106
- : 0;
104
+ // Use the same criteria as WaitUntilFlushWouldNotStallWrites does w.r.t
105
+ // defining what a write stall is about to happen means. If this uses a
106
+ // stricter criteria, for example, a write stall is about to happen if the
107
+ // last memtable is 10% full, there is a possibility that manual flush could
108
+ // be waiting in `WaitUntilFlushWouldNotStallWrites` with the incorrect
109
+ // expectation that others will clear up the excessive memtables and
110
+ // eventually let it proceed. The others in this case won't start clearing
111
+ // until the last memtable is 10% full. To avoid that scenario, the criteria
112
+ // this uses should be the same or less strict than
113
+ // `WaitUntilFlushWouldNotStallWrites` does.
107
114
  WriteStallCondition write_stall =
108
115
  ColumnFamilyData::GetWriteStallConditionAndCause(
109
- cfd->imm()->NumNotFlushed() + mem_to_flush, /*num_l0_files=*/0,
116
+ cfd->GetUnflushedMemTableCountForWriteStallCheck(),
117
+ /*num_l0_files=*/0,
110
118
  /*num_compaction_needed_bytes=*/0, mutable_cf_options,
111
119
  *cfd->ioptions())
112
120
  .first;
@@ -116,89 +124,19 @@ bool DBImpl::ShouldRescheduleFlushRequestToRetainUDT(
116
124
  return true;
117
125
  }
118
126
 
119
- IOStatus DBImpl::SyncClosedLogs(const WriteOptions& write_options,
127
+ IOStatus DBImpl::SyncClosedWals(const WriteOptions& write_options,
120
128
  JobContext* job_context,
121
129
  VersionEdit* synced_wals,
122
130
  bool error_recovery_in_prog) {
123
- TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start");
124
- InstrumentedMutexLock l(&log_write_mutex_);
125
- autovector<log::Writer*, 1> logs_to_sync;
126
- uint64_t current_log_number = logfile_number_;
127
- while (logs_.front().number < current_log_number &&
128
- logs_.front().IsSyncing()) {
129
- log_sync_cv_.Wait();
130
- }
131
- for (auto it = logs_.begin();
132
- it != logs_.end() && it->number < current_log_number; ++it) {
133
- auto& log = *it;
134
- log.PrepareForSync();
135
- logs_to_sync.push_back(log.writer);
136
- }
137
-
138
- IOStatus io_s;
139
- if (!logs_to_sync.empty()) {
140
- log_write_mutex_.Unlock();
141
-
142
- assert(job_context);
143
-
144
- for (log::Writer* log : logs_to_sync) {
145
- ROCKS_LOG_INFO(immutable_db_options_.info_log,
146
- "[JOB %d] Syncing log #%" PRIu64, job_context->job_id,
147
- log->get_log_number());
148
- if (error_recovery_in_prog) {
149
- log->file()->reset_seen_error();
150
- }
151
-
152
- IOOptions io_options;
153
- io_s = WritableFileWriter::PrepareIOOptions(write_options, io_options);
154
- if (!io_s.ok()) {
155
- break;
156
- }
157
- io_s = log->file()->Sync(io_options, immutable_db_options_.use_fsync);
158
- if (!io_s.ok()) {
159
- break;
160
- }
161
-
162
- if (immutable_db_options_.recycle_log_file_num > 0) {
163
- if (error_recovery_in_prog) {
164
- log->file()->reset_seen_error();
165
- }
166
- // Normally the log file is closed when purging obsolete file, but if
167
- // log recycling is enabled, the log file is closed here so that it
168
- // can be reused.
169
- io_s = log->Close(write_options);
170
- if (!io_s.ok()) {
171
- break;
172
- }
173
- }
174
- }
175
- if (io_s.ok()) {
176
- IOOptions io_options;
177
- io_s = WritableFileWriter::PrepareIOOptions(write_options, io_options);
178
- if (io_s.ok()) {
179
- io_s = directories_.GetWalDir()->FsyncWithDirOptions(
180
- io_options, nullptr,
181
- DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
182
- }
183
- }
184
-
185
- TEST_SYNC_POINT_CALLBACK("DBImpl::SyncClosedLogs:BeforeReLock",
186
- /*arg=*/nullptr);
187
- log_write_mutex_.Lock();
131
+ TEST_SYNC_POINT("DBImpl::SyncClosedWals:Start");
188
132
 
189
- // "number <= current_log_number - 1" is equivalent to
190
- // "number < current_log_number".
191
- if (io_s.ok()) {
192
- MarkLogsSynced(current_log_number - 1, true, synced_wals);
193
- } else {
194
- MarkLogsNotSynced(current_log_number - 1);
195
- }
196
- if (!io_s.ok()) {
197
- TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Failed");
198
- return io_s;
199
- }
133
+ IOStatus io_s = SyncWalImpl(/*include_current_wal*/ false, write_options,
134
+ job_context, synced_wals, error_recovery_in_prog);
135
+ if (!io_s.ok()) {
136
+ TEST_SYNC_POINT("DBImpl::SyncClosedWals:Failed");
137
+ } else {
138
+ TEST_SYNC_POINT("DBImpl::SyncClosedWals:end");
200
139
  }
201
- TEST_SYNC_POINT("DBImpl::SyncClosedLogs:end");
202
140
  return io_s;
203
141
  }
204
142
 
@@ -237,12 +175,12 @@ Status DBImpl::FlushMemTableToOutputFile(
237
175
  // If needs_to_sync_closed_wals is true, we need to record the current
238
176
  // maximum memtable ID of this column family so that a later PickMemtables()
239
177
  // call will not pick memtables whose IDs are higher. This is due to the fact
240
- // that SyncClosedLogs() may release the db mutex, and memtable switch can
178
+ // that SyncClosedWals() may release the db mutex, and memtable switch can
241
179
  // happen for this column family in the meantime. The newly created memtables
242
180
  // have their data backed by unsynced WALs, thus they cannot be included in
243
181
  // this flush job.
244
182
  // Another reason why we must record the current maximum memtable ID of this
245
- // column family: SyncClosedLogs() may release db mutex, thus it's possible
183
+ // column family: SyncClosedWals() may release db mutex, thus it's possible
246
184
  // for application to continue to insert into memtables increasing db's
247
185
  // sequence number. The application may take a snapshot, but this snapshot is
248
186
  // not included in `snapshot_seqs` which will be passed to flush job because
@@ -256,7 +194,7 @@ Status DBImpl::FlushMemTableToOutputFile(
256
194
 
257
195
  // If needs_to_sync_closed_wals is false, then the flush job will pick ALL
258
196
  // existing memtables of the column family when PickMemTable() is called
259
- // later. Although we won't call SyncClosedLogs() in this case, we may still
197
+ // later. Although we won't call SyncClosedWals() in this case, we may still
260
198
  // call the callbacks of the listeners, i.e. NotifyOnFlushBegin() which also
261
199
  // releases and re-acquires the db mutex. In the meantime, the application
262
200
  // can still insert into the memtables and increase the db's sequence number.
@@ -286,12 +224,12 @@ Status DBImpl::FlushMemTableToOutputFile(
286
224
  bool need_cancel = false;
287
225
  IOStatus log_io_s = IOStatus::OK();
288
226
  if (needs_to_sync_closed_wals) {
289
- // SyncClosedLogs() may unlock and re-lock the log_write_mutex multiple
227
+ // SyncClosedWals() may unlock and re-lock the log_write_mutex multiple
290
228
  // times.
291
229
  VersionEdit synced_wals;
292
230
  bool error_recovery_in_prog = error_handler_.IsRecoveryInProgress();
293
231
  mutex_.Unlock();
294
- log_io_s = SyncClosedLogs(write_options, job_context, &synced_wals,
232
+ log_io_s = SyncClosedWals(write_options, job_context, &synced_wals,
295
233
  error_recovery_in_prog);
296
234
  mutex_.Lock();
297
235
  if (log_io_s.ok() && synced_wals.IsWalAddition()) {
@@ -306,7 +244,7 @@ Status DBImpl::FlushMemTableToOutputFile(
306
244
  error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush);
307
245
  }
308
246
  } else {
309
- TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Skip");
247
+ TEST_SYNC_POINT("DBImpl::SyncClosedWals:Skip");
310
248
  }
311
249
  s = log_io_s;
312
250
 
@@ -580,7 +518,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
580
518
  VersionEdit synced_wals;
581
519
  bool error_recovery_in_prog = error_handler_.IsRecoveryInProgress();
582
520
  mutex_.Unlock();
583
- log_io_s = SyncClosedLogs(write_options, job_context, &synced_wals,
521
+ log_io_s = SyncClosedWals(write_options, job_context, &synced_wals,
584
522
  error_recovery_in_prog);
585
523
  mutex_.Lock();
586
524
  if (log_io_s.ok() && synced_wals.IsWalAddition()) {
@@ -1657,10 +1595,12 @@ Status DBImpl::CompactFilesImpl(
1657
1595
 
1658
1596
  ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
1659
1597
 
1598
+ mutex_.Unlock();
1660
1599
  if (compaction_job_info != nullptr) {
1661
1600
  BuildCompactionJobInfo(cfd, c.get(), s, compaction_job_stats,
1662
1601
  job_context->job_id, compaction_job_info);
1663
1602
  }
1603
+ mutex_.Lock();
1664
1604
 
1665
1605
  if (status.ok()) {
1666
1606
  // Done
@@ -2310,6 +2250,23 @@ void DBImpl::GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
2310
2250
  }
2311
2251
  }
2312
2252
 
2253
+ void DBImpl::NotifyOnManualFlushScheduled(autovector<ColumnFamilyData*> cfds,
2254
+ FlushReason flush_reason) {
2255
+ if (immutable_db_options_.listeners.size() == 0U) {
2256
+ return;
2257
+ }
2258
+ if (shutting_down_.load(std::memory_order_acquire)) {
2259
+ return;
2260
+ }
2261
+ std::vector<ManualFlushInfo> info;
2262
+ for (ColumnFamilyData* cfd : cfds) {
2263
+ info.push_back({cfd->GetID(), cfd->GetName(), flush_reason});
2264
+ }
2265
+ for (const auto& listener : immutable_db_options_.listeners) {
2266
+ listener->OnManualFlushScheduled(this, info);
2267
+ }
2268
+ }
2269
+
2313
2270
  Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
2314
2271
  const FlushOptions& flush_options,
2315
2272
  FlushReason flush_reason,
@@ -2414,7 +2371,14 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
2414
2371
  }
2415
2372
  }
2416
2373
  for (const auto& req : flush_reqs) {
2417
- SchedulePendingFlush(req);
2374
+ assert(req.cfd_to_max_mem_id_to_persist.size() == 1);
2375
+ ColumnFamilyData* loop_cfd =
2376
+ req.cfd_to_max_mem_id_to_persist.begin()->first;
2377
+ bool already_queued_for_flush = loop_cfd->queued_for_flush();
2378
+ bool flush_req_enqueued = SchedulePendingFlush(req);
2379
+ if (already_queued_for_flush || flush_req_enqueued) {
2380
+ loop_cfd->SetFlushSkipReschedule();
2381
+ }
2418
2382
  }
2419
2383
  MaybeScheduleFlushOrCompaction();
2420
2384
  }
@@ -2426,6 +2390,8 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
2426
2390
  }
2427
2391
  }
2428
2392
  }
2393
+
2394
+ NotifyOnManualFlushScheduled({cfd}, flush_reason);
2429
2395
  TEST_SYNC_POINT("DBImpl::FlushMemTable:AfterScheduleFlush");
2430
2396
  TEST_SYNC_POINT("DBImpl::FlushMemTable:BeforeWaitForBgFlush");
2431
2397
  if (s.ok() && flush_options.wait) {
@@ -2570,6 +2536,7 @@ Status DBImpl::AtomicFlushMemTables(
2570
2536
  }
2571
2537
  }
2572
2538
  }
2539
+ NotifyOnManualFlushScheduled(cfds, flush_reason);
2573
2540
  TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:AfterScheduleFlush");
2574
2541
  TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush");
2575
2542
  if (s.ok() && flush_options.wait) {
@@ -2627,7 +2594,9 @@ Status DBImpl::RetryFlushesForErrorRecovery(FlushReason flush_reason,
2627
2594
  flush_reason,
2628
2595
  {{cfd,
2629
2596
  std::numeric_limits<uint64_t>::max() /* max_mem_id_to_persist */}}};
2630
- SchedulePendingFlush(flush_req);
2597
+ if (SchedulePendingFlush(flush_req)) {
2598
+ cfd->SetFlushSkipReschedule();
2599
+ };
2631
2600
  }
2632
2601
  }
2633
2602
  MaybeScheduleFlushOrCompaction();
@@ -2715,13 +2684,13 @@ Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
2715
2684
  // mode due to pending compaction bytes, but that's less common
2716
2685
  // No extra immutable Memtable will be created if the current Memtable is
2717
2686
  // empty.
2718
- int mem_to_flush = cfd->mem()->IsEmpty() ? 0 : 1;
2719
- write_stall_condition = ColumnFamilyData::GetWriteStallConditionAndCause(
2720
- cfd->imm()->NumNotFlushed() + mem_to_flush,
2721
- vstorage->l0_delay_trigger_count() + 1,
2722
- vstorage->estimated_compaction_needed_bytes(),
2723
- mutable_cf_options, *cfd->ioptions())
2724
- .first;
2687
+ write_stall_condition =
2688
+ ColumnFamilyData::GetWriteStallConditionAndCause(
2689
+ cfd->GetUnflushedMemTableCountForWriteStallCheck(),
2690
+ vstorage->l0_delay_trigger_count() + 1,
2691
+ vstorage->estimated_compaction_needed_bytes(), mutable_cf_options,
2692
+ *cfd->ioptions())
2693
+ .first;
2725
2694
  } while (write_stall_condition != WriteStallCondition::kNormal);
2726
2695
  }
2727
2696
  return Status::OK();
@@ -3033,13 +3002,14 @@ ColumnFamilyData* DBImpl::PickCompactionFromQueue(
3033
3002
  return cfd;
3034
3003
  }
3035
3004
 
3036
- void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req) {
3005
+ bool DBImpl::SchedulePendingFlush(const FlushRequest& flush_req) {
3037
3006
  mutex_.AssertHeld();
3007
+ bool enqueued = false;
3038
3008
  if (reject_new_background_jobs_) {
3039
- return;
3009
+ return enqueued;
3040
3010
  }
3041
3011
  if (flush_req.cfd_to_max_mem_id_to_persist.empty()) {
3042
- return;
3012
+ return enqueued;
3043
3013
  }
3044
3014
  if (!immutable_db_options_.atomic_flush) {
3045
3015
  // For the non-atomic flush case, we never schedule multiple column
@@ -3054,6 +3024,7 @@ void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req) {
3054
3024
  cfd->set_queued_for_flush(true);
3055
3025
  ++unscheduled_flushes_;
3056
3026
  flush_queue_.push_back(flush_req);
3027
+ enqueued = true;
3057
3028
  }
3058
3029
  } else {
3059
3030
  for (auto& iter : flush_req.cfd_to_max_mem_id_to_persist) {
@@ -3062,7 +3033,9 @@ void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req) {
3062
3033
  }
3063
3034
  ++unscheduled_flushes_;
3064
3035
  flush_queue_.push_back(flush_req);
3036
+ enqueued = true;
3065
3037
  }
3038
+ return enqueued;
3066
3039
  }
3067
3040
 
3068
3041
  void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) {
@@ -4190,7 +4163,7 @@ void DBImpl::BuildCompactionJobInfo(
4190
4163
  compaction_job_info->base_input_level = c->start_level();
4191
4164
  compaction_job_info->output_level = c->output_level();
4192
4165
  compaction_job_info->stats = compaction_job_stats;
4193
- const auto& input_table_properties = c->GetInputTableProperties();
4166
+ const auto& input_table_properties = c->GetOrInitInputTableProperties();
4194
4167
  const auto& output_table_properties = c->GetOutputTableProperties();
4195
4168
  compaction_job_info->table_properties.insert(input_table_properties.begin(),
4196
4169
  input_table_properties.end());
@@ -312,6 +312,26 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
312
312
  // logs_ could have changed while we were waiting.
313
313
  continue;
314
314
  }
315
+ // This WAL file is not live, so it's OK if we never sync the rest of it.
316
+ // If it's already closed, then it's been fully synced. If
317
+ // !background_close_inactive_wals then we need to Close it before
318
+ // removing from logs_ but not blocking while holding log_write_mutex_.
319
+ if (!immutable_db_options_.background_close_inactive_wals &&
320
+ log.writer->file()) {
321
+ // We are taking ownership of and pinning the front entry, so we can
322
+ // expect it to be the same after releasing and re-acquiring the lock
323
+ log.PrepareForSync();
324
+ log_write_mutex_.Unlock();
325
+ // TODO: maybe check the return value of Close.
326
+ // TODO: plumb Env::IOActivity, Env::IOPriority
327
+ auto s = log.writer->file()->Close({});
328
+ s.PermitUncheckedError();
329
+ log_write_mutex_.Lock();
330
+ log.writer->PublishIfClosed();
331
+ assert(&log == &logs_.front());
332
+ log.FinishSync();
333
+ log_sync_cv_.SignalAll();
334
+ }
315
335
  logs_to_free_.push_back(log.ReleaseWriter());
316
336
  logs_.pop_front();
317
337
  }
@@ -410,12 +430,24 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
410
430
  state.manifest_delete_files.size());
411
431
  // We may ignore the dbname when generating the file names.
412
432
  for (auto& file : state.sst_delete_files) {
413
- if (!file.only_delete_metadata) {
414
- candidate_files.emplace_back(
415
- MakeTableFileName(file.metadata->fd.GetNumber()), file.path);
416
- }
417
- if (file.metadata->table_reader_handle) {
418
- table_cache_->Release(file.metadata->table_reader_handle);
433
+ auto* handle = file.metadata->table_reader_handle;
434
+ if (file.only_delete_metadata) {
435
+ if (handle) {
436
+ // Simply release handle of file that is not being deleted
437
+ table_cache_->Release(handle);
438
+ }
439
+ } else {
440
+ // File is being deleted (actually obsolete)
441
+ auto number = file.metadata->fd.GetNumber();
442
+ candidate_files.emplace_back(MakeTableFileName(number), file.path);
443
+ if (handle == nullptr) {
444
+ // For files not "pinned" in table cache
445
+ handle = TableCache::Lookup(table_cache_.get(), number);
446
+ }
447
+ if (handle) {
448
+ TableCache::ReleaseObsolete(table_cache_.get(), handle,
449
+ file.uncache_aggressiveness);
450
+ }
419
451
  }
420
452
  file.DeleteMetadata();
421
453
  }
@@ -491,7 +523,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
491
523
  for (const auto w : state.logs_to_free) {
492
524
  // TODO: maybe check the return value of Close.
493
525
  // TODO: plumb Env::IOActivity, Env::IOPriority
494
- auto s = w->Close(WriteOptions());
526
+ auto s = w->Close({});
495
527
  s.PermitUncheckedError();
496
528
  }
497
529
 
@@ -577,8 +609,6 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
577
609
  std::string fname;
578
610
  std::string dir_to_sync;
579
611
  if (type == kTableFile) {
580
- // evict from cache
581
- TableCache::Evict(table_cache_.get(), number);
582
612
  fname = MakeTableFileName(candidate_file.file_path, number);
583
613
  dir_to_sync = candidate_file.file_path;
584
614
  } else if (type == kBlobFile) {
@@ -5,6 +5,7 @@
5
5
 
6
6
  #include "db/db_impl/db_impl_follower.h"
7
7
 
8
+ #include <algorithm>
8
9
  #include <cinttypes>
9
10
 
10
11
  #include "db/arena_wrapped_db_iter.h"