@nxtedition/rocksdb 13.5.7 → 13.5.9

This diff represents the content of publicly available package versions released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (509)
  1. package/binding.cc +248 -70
  2. package/binding.gyp +2 -2
  3. package/deps/rocksdb/rocksdb/BUCK +12 -0
  4. package/deps/rocksdb/rocksdb/CMakeLists.txt +7 -0
  5. package/deps/rocksdb/rocksdb/Makefile +28 -23
  6. package/deps/rocksdb/rocksdb/cache/cache.cc +0 -1
  7. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +1 -2
  8. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +43 -39
  9. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -0
  10. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +0 -1
  11. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +2 -3
  12. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +2 -2
  13. package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +1 -3
  14. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +11 -1
  15. package/deps/rocksdb/rocksdb/cache/tiered_secondary_cache_test.cc +13 -5
  16. package/deps/rocksdb/rocksdb/crash_test.mk +61 -15
  17. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +136 -45
  18. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +34 -16
  19. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +10 -7
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +1 -2
  21. package/deps/rocksdb/rocksdb/db/blob/blob_file_meta.h +1 -0
  22. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +12 -9
  23. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +3 -4
  24. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +2 -2
  25. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +3 -4
  26. package/deps/rocksdb/rocksdb/db/builder.cc +22 -8
  27. package/deps/rocksdb/rocksdb/db/builder.h +5 -4
  28. package/deps/rocksdb/rocksdb/db/c.cc +556 -15
  29. package/deps/rocksdb/rocksdb/db/c_test.c +133 -12
  30. package/deps/rocksdb/rocksdb/db/column_family.cc +114 -50
  31. package/deps/rocksdb/rocksdb/db/column_family.h +53 -36
  32. package/deps/rocksdb/rocksdb/db/column_family_test.cc +6 -6
  33. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +0 -1
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +95 -70
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +71 -51
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +7 -86
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +26 -68
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +0 -122
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +453 -258
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +117 -92
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +0 -1
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +38 -38
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +24 -17
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +34 -45
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +32 -31
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +12 -3
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +1 -1
  48. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +2 -1
  49. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +10 -10
  50. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +2 -1
  51. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +82 -34
  52. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +267 -179
  53. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +4 -1
  54. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +273 -89
  55. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +300 -14
  56. package/deps/rocksdb/rocksdb/db/compaction/compaction_state.cc +4 -4
  57. package/deps/rocksdb/rocksdb/db/compaction/compaction_state.h +2 -2
  58. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +28 -23
  59. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +69 -51
  60. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +522 -245
  61. package/deps/rocksdb/rocksdb/db/convenience.cc +15 -4
  62. package/deps/rocksdb/rocksdb/db/corruption_test.cc +1 -3
  63. package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +0 -2
  64. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +196 -17
  65. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +74 -62
  66. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +48 -0
  67. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +682 -250
  68. package/deps/rocksdb/rocksdb/db/db_dynamic_level_test.cc +0 -1
  69. package/deps/rocksdb/rocksdb/db/db_encryption_test.cc +3 -4
  70. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +11 -16
  71. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +57 -0
  72. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +2 -2
  73. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +1 -1
  74. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +540 -490
  75. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +347 -188
  76. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +584 -217
  77. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +13 -9
  78. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +5 -7
  79. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +40 -36
  80. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +1 -3
  81. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +751 -372
  82. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +35 -32
  83. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +24 -2
  84. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +125 -63
  85. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +2 -2
  86. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +311 -196
  87. package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +15 -5
  88. package/deps/rocksdb/rocksdb/db/db_iter.cc +42 -29
  89. package/deps/rocksdb/rocksdb/db/db_iter.h +96 -31
  90. package/deps/rocksdb/rocksdb/db/db_iter_stress_test.cc +3 -4
  91. package/deps/rocksdb/rocksdb/db/db_iter_test.cc +168 -228
  92. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +454 -0
  93. package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +8 -8
  94. package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +0 -1
  95. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +90 -0
  96. package/deps/rocksdb/rocksdb/db/db_merge_operand_test.cc +60 -2
  97. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +7 -3
  98. package/deps/rocksdb/rocksdb/db/db_options_test.cc +85 -27
  99. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +3 -1
  100. package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +0 -2
  101. package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +114 -2
  102. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +0 -1
  103. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +0 -1
  104. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +51 -3
  105. package/deps/rocksdb/rocksdb/db/db_tailing_iter_test.cc +0 -1
  106. package/deps/rocksdb/rocksdb/db/db_test.cc +325 -18
  107. package/deps/rocksdb/rocksdb/db/db_test2.cc +644 -20
  108. package/deps/rocksdb/rocksdb/db/db_test_util.cc +14 -6
  109. package/deps/rocksdb/rocksdb/db/db_test_util.h +9 -0
  110. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +64 -45
  111. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +203 -14
  112. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +259 -30
  113. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +0 -1
  114. package/deps/rocksdb/rocksdb/db/db_write_test.cc +75 -1
  115. package/deps/rocksdb/rocksdb/db/dbformat.h +70 -6
  116. package/deps/rocksdb/rocksdb/db/deletefile_test.cc +0 -190
  117. package/deps/rocksdb/rocksdb/db/error_handler.cc +22 -7
  118. package/deps/rocksdb/rocksdb/db/error_handler.h +16 -1
  119. package/deps/rocksdb/rocksdb/db/event_helpers.cc +41 -26
  120. package/deps/rocksdb/rocksdb/db/experimental.cc +4 -3
  121. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +464 -78
  122. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +166 -69
  123. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +54 -25
  124. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +1 -3
  125. package/deps/rocksdb/rocksdb/db/flush_job.cc +98 -81
  126. package/deps/rocksdb/rocksdb/db/flush_job.h +4 -9
  127. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +80 -84
  128. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +1 -1
  129. package/deps/rocksdb/rocksdb/db/forward_iterator.h +2 -2
  130. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +12 -19
  131. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +0 -2
  132. package/deps/rocksdb/rocksdb/db/internal_stats.cc +41 -15
  133. package/deps/rocksdb/rocksdb/db/internal_stats.h +63 -52
  134. package/deps/rocksdb/rocksdb/db/job_context.h +59 -24
  135. package/deps/rocksdb/rocksdb/db/listener_test.cc +69 -10
  136. package/deps/rocksdb/rocksdb/db/log_format.h +11 -2
  137. package/deps/rocksdb/rocksdb/db/log_reader.cc +147 -34
  138. package/deps/rocksdb/rocksdb/db/log_reader.h +40 -11
  139. package/deps/rocksdb/rocksdb/db/log_test.cc +16 -3
  140. package/deps/rocksdb/rocksdb/db/log_writer.cc +102 -55
  141. package/deps/rocksdb/rocksdb/db/log_writer.h +21 -2
  142. package/deps/rocksdb/rocksdb/db/malloc_stats.h +0 -2
  143. package/deps/rocksdb/rocksdb/db/memtable.cc +16 -47
  144. package/deps/rocksdb/rocksdb/db/memtable.h +76 -12
  145. package/deps/rocksdb/rocksdb/db/memtable_list.cc +23 -20
  146. package/deps/rocksdb/rocksdb/db/memtable_list.h +9 -11
  147. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +18 -37
  148. package/deps/rocksdb/rocksdb/db/merge_context.h +2 -1
  149. package/deps/rocksdb/rocksdb/db/merge_test.cc +8 -0
  150. package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +3 -5
  151. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.cc +15 -7
  152. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.h +6 -3
  153. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +22 -4
  154. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +41 -1
  155. package/deps/rocksdb/rocksdb/db/prefix_test.cc +0 -1
  156. package/deps/rocksdb/rocksdb/db/repair.cc +29 -34
  157. package/deps/rocksdb/rocksdb/db/repair_test.cc +0 -1
  158. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +14 -15
  159. package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.cc +1 -3
  160. package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.h +47 -1
  161. package/deps/rocksdb/rocksdb/db/table_cache.cc +3 -3
  162. package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +1 -3
  163. package/deps/rocksdb/rocksdb/db/transaction_log_impl.h +2 -1
  164. package/deps/rocksdb/rocksdb/db/version_builder.cc +2 -2
  165. package/deps/rocksdb/rocksdb/db/version_edit.cc +8 -37
  166. package/deps/rocksdb/rocksdb/db/version_edit.h +32 -1
  167. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +26 -18
  168. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +7 -5
  169. package/deps/rocksdb/rocksdb/db/version_set.cc +282 -197
  170. package/deps/rocksdb/rocksdb/db/version_set.h +54 -57
  171. package/deps/rocksdb/rocksdb/db/version_set_test.cc +28 -35
  172. package/deps/rocksdb/rocksdb/db/version_util.h +2 -3
  173. package/deps/rocksdb/rocksdb/db/wal_manager.cc +3 -2
  174. package/deps/rocksdb/rocksdb/db/wal_manager.h +0 -1
  175. package/deps/rocksdb/rocksdb/db/wal_manager_test.cc +0 -1
  176. package/deps/rocksdb/rocksdb/db/wide/wide_columns.cc +1 -0
  177. package/deps/rocksdb/rocksdb/db/write_batch.cc +22 -8
  178. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +5 -4
  179. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +7 -6
  180. package/deps/rocksdb/rocksdb/db/write_callback_test.cc +3 -4
  181. package/deps/rocksdb/rocksdb/db/write_thread.h +3 -3
  182. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +13 -5
  183. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +9 -2
  184. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_service.h +39 -0
  185. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compression_manager.h +65 -0
  186. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +45 -22
  187. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +7 -4
  188. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +22 -5
  189. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_table_properties_collector.h +28 -3
  190. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +143 -38
  191. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +4 -3
  192. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +80 -32
  193. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.h +51 -2
  194. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +23 -1
  195. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +305 -15
  196. package/deps/rocksdb/rocksdb/env/env.cc +32 -2
  197. package/deps/rocksdb/rocksdb/env/env_encryption.cc +0 -2
  198. package/deps/rocksdb/rocksdb/env/env_encryption_ctr.h +2 -4
  199. package/deps/rocksdb/rocksdb/env/env_posix.cc +4 -2
  200. package/deps/rocksdb/rocksdb/env/env_test.cc +0 -1
  201. package/deps/rocksdb/rocksdb/env/fs_posix.cc +20 -11
  202. package/deps/rocksdb/rocksdb/env/fs_readonly.h +0 -2
  203. package/deps/rocksdb/rocksdb/env/fs_remap.cc +0 -2
  204. package/deps/rocksdb/rocksdb/env/fs_remap.h +0 -2
  205. package/deps/rocksdb/rocksdb/env/io_posix.cc +6 -4
  206. package/deps/rocksdb/rocksdb/env/io_posix.h +3 -2
  207. package/deps/rocksdb/rocksdb/env/mock_env.cc +0 -1
  208. package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +2 -2
  209. package/deps/rocksdb/rocksdb/file/delete_scheduler.h +0 -2
  210. package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +0 -2
  211. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +30 -21
  212. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +16 -0
  213. package/deps/rocksdb/rocksdb/file/file_util.cc +32 -14
  214. package/deps/rocksdb/rocksdb/file/file_util.h +22 -5
  215. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +229 -76
  216. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +21 -12
  217. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +10 -7
  218. package/deps/rocksdb/rocksdb/file/random_access_file_reader_test.cc +12 -8
  219. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.cc +1 -2
  220. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.h +0 -2
  221. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +3 -3
  222. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +598 -0
  223. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_iterator.h +36 -0
  224. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +70 -11
  225. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +232 -11
  226. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +1 -1
  227. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +3 -1
  228. package/deps/rocksdb/rocksdb/include/rocksdb/compression_type.h +149 -15
  229. package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +17 -2
  230. package/deps/rocksdb/rocksdb/include/rocksdb/data_structure.h +132 -34
  231. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +158 -79
  232. package/deps/rocksdb/rocksdb/include/rocksdb/db_bench_tool.h +2 -1
  233. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +4 -5
  234. package/deps/rocksdb/rocksdb/include/rocksdb/env_encryption.h +1 -3
  235. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +5 -0
  236. package/deps/rocksdb/rocksdb/include/rocksdb/external_table.h +275 -0
  237. package/deps/rocksdb/rocksdb/include/rocksdb/file_checksum.h +2 -1
  238. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +50 -5
  239. package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +10 -0
  240. package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +13 -0
  241. package/deps/rocksdb/rocksdb/include/rocksdb/ldb_tool.h +0 -1
  242. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +5 -2
  243. package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +13 -0
  244. package/deps/rocksdb/rocksdb/include/rocksdb/multi_scan.h +237 -0
  245. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +230 -39
  246. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +15 -0
  247. package/deps/rocksdb/rocksdb/include/rocksdb/perf_level.h +31 -11
  248. package/deps/rocksdb/rocksdb/include/rocksdb/slice.h +41 -0
  249. package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +1 -1
  250. package/deps/rocksdb/rocksdb/include/rocksdb/sst_dump_tool.h +0 -1
  251. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_reader.h +5 -1
  252. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +0 -1
  253. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +18 -3
  254. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +2 -0
  255. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +20 -8
  256. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +19 -2
  257. package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +1 -1
  258. package/deps/rocksdb/rocksdb/include/rocksdb/tool_hooks.h +124 -0
  259. package/deps/rocksdb/rocksdb/include/rocksdb/trace_record.h +1 -0
  260. package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +26 -1
  261. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h +55 -6
  262. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/debug.h +3 -5
  263. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/env_mirror.h +0 -2
  264. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +1 -2
  265. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/memory_util.h +0 -1
  266. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/object_registry.h +1 -2
  267. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h +0 -1
  268. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/secondary_index.h +96 -8
  269. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/secondary_index_faiss.h +117 -0
  270. package/deps/rocksdb/rocksdb/{utilities/secondary_index/faiss_ivf_index.h → include/rocksdb/utilities/secondary_index_simple.h} +11 -14
  271. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +26 -11
  272. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +16 -3
  273. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +0 -2
  274. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +63 -7
  275. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h +0 -1
  276. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +28 -12
  277. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +3 -3
  278. package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +0 -2
  279. package/deps/rocksdb/rocksdb/logging/event_logger_test.cc +1 -2
  280. package/deps/rocksdb/rocksdb/memory/memory_allocator_impl.h +1 -1
  281. package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +0 -1
  282. package/deps/rocksdb/rocksdb/memtable/hash_linklist_rep.cc +0 -1
  283. package/deps/rocksdb/rocksdb/memtable/memtablerep_bench.cc +3 -1
  284. package/deps/rocksdb/rocksdb/memtable/skiplist.h +2 -2
  285. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +2 -4
  286. package/deps/rocksdb/rocksdb/memtable/vectorrep.cc +69 -8
  287. package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.cc +32 -9
  288. package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.h +58 -45
  289. package/deps/rocksdb/rocksdb/monitoring/histogram.h +1 -1
  290. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +5 -3
  291. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +5 -0
  292. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +1 -1
  293. package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +3 -2
  294. package/deps/rocksdb/rocksdb/options/cf_options.cc +44 -13
  295. package/deps/rocksdb/rocksdb/options/cf_options.h +21 -7
  296. package/deps/rocksdb/rocksdb/options/configurable.cc +5 -5
  297. package/deps/rocksdb/rocksdb/options/configurable_test.h +1 -2
  298. package/deps/rocksdb/rocksdb/options/customizable.cc +0 -1
  299. package/deps/rocksdb/rocksdb/options/customizable_test.cc +4 -11
  300. package/deps/rocksdb/rocksdb/options/db_options.cc +18 -15
  301. package/deps/rocksdb/rocksdb/options/db_options.h +2 -2
  302. package/deps/rocksdb/rocksdb/options/options.cc +296 -305
  303. package/deps/rocksdb/rocksdb/options/options_helper.cc +188 -62
  304. package/deps/rocksdb/rocksdb/options/options_helper.h +3 -3
  305. package/deps/rocksdb/rocksdb/options/options_parser.cc +2 -4
  306. package/deps/rocksdb/rocksdb/options/options_parser.h +0 -1
  307. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +17 -4
  308. package/deps/rocksdb/rocksdb/options/options_test.cc +101 -76
  309. package/deps/rocksdb/rocksdb/port/lang.h +2 -1
  310. package/deps/rocksdb/rocksdb/port/port_posix.cc +2 -1
  311. package/deps/rocksdb/rocksdb/port/stack_trace.cc +5 -4
  312. package/deps/rocksdb/rocksdb/port/win/env_win.cc +3 -2
  313. package/deps/rocksdb/rocksdb/port/win/xpress_win.cc +99 -1
  314. package/deps/rocksdb/rocksdb/port/win/xpress_win.h +6 -0
  315. package/deps/rocksdb/rocksdb/src.mk +17 -11
  316. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h +0 -1
  317. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +1094 -929
  318. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +6 -19
  319. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +76 -22
  320. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +2 -0
  321. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +221 -131
  322. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +12 -9
  323. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +23 -24
  324. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +38 -38
  325. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +7 -4
  326. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +5 -5
  327. package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +10 -12
  328. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +6 -4
  329. package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +35 -43
  330. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +2 -1
  331. package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +1 -1
  332. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +1 -2
  333. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +0 -4
  334. package/deps/rocksdb/rocksdb/table/block_based/filter_policy_internal.h +0 -1
  335. package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +3 -3
  336. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +3 -3
  337. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +4 -4
  338. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +1 -1
  339. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +4 -5
  340. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +4 -4
  341. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +37 -35
  342. package/deps/rocksdb/rocksdb/table/block_fetcher.h +11 -7
  343. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +4 -3
  344. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +31 -5
  345. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +2 -1
  346. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.h +0 -1
  347. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc +0 -1
  348. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc +0 -1
  349. package/deps/rocksdb/rocksdb/table/external_table.cc +483 -0
  350. package/deps/rocksdb/rocksdb/table/format.cc +62 -44
  351. package/deps/rocksdb/rocksdb/table/format.h +35 -12
  352. package/deps/rocksdb/rocksdb/table/internal_iterator.h +3 -13
  353. package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +8 -0
  354. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +6 -0
  355. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +150 -141
  356. package/deps/rocksdb/rocksdb/table/meta_blocks.h +5 -0
  357. package/deps/rocksdb/rocksdb/table/multiget_context.h +3 -2
  358. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +8 -0
  359. package/deps/rocksdb/rocksdb/table/plain/plain_table_index.cc +0 -1
  360. package/deps/rocksdb/rocksdb/table/plain/plain_table_index.h +0 -2
  361. package/deps/rocksdb/rocksdb/table/plain/plain_table_key_coding.h +0 -2
  362. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +0 -1
  363. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +6 -6
  364. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +0 -1
  365. package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +86 -7
  366. package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +88 -2
  367. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +0 -1
  368. package/deps/rocksdb/rocksdb/table/table_builder.h +10 -1
  369. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +3 -2
  370. package/deps/rocksdb/rocksdb/table/table_test.cc +899 -22
  371. package/deps/rocksdb/rocksdb/test_util/testutil.cc +3 -4
  372. package/deps/rocksdb/rocksdb/test_util/testutil.h +132 -1
  373. package/deps/rocksdb/rocksdb/test_util/transaction_test_util.cc +0 -1
  374. package/deps/rocksdb/rocksdb/test_util/transaction_test_util.h +0 -2
  375. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +163 -77
  376. package/deps/rocksdb/rocksdb/tools/db_bench_tool_test.cc +0 -2
  377. package/deps/rocksdb/rocksdb/tools/db_repl_stress.cc +0 -1
  378. package/deps/rocksdb/rocksdb/tools/dump/db_dump_tool.cc +0 -1
  379. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +120 -52
  380. package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +1 -0
  381. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +1 -1
  382. package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +0 -2
  383. package/deps/rocksdb/rocksdb/tools/simulated_hybrid_file_system.cc +2 -2
  384. package/deps/rocksdb/rocksdb/tools/simulated_hybrid_file_system.h +0 -2
  385. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +2 -1
  386. package/deps/rocksdb/rocksdb/tools/tool_hooks.cc +94 -0
  387. package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.cc +0 -1
  388. package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.h +0 -1
  389. package/deps/rocksdb/rocksdb/trace_replay/io_tracer.cc +1 -1
  390. package/deps/rocksdb/rocksdb/trace_replay/io_tracer_test.cc +2 -1
  391. package/deps/rocksdb/rocksdb/trace_replay/trace_replay.cc +3 -5
  392. package/deps/rocksdb/rocksdb/util/async_file_reader.cc +1 -1
  393. package/deps/rocksdb/rocksdb/util/async_file_reader.h +15 -8
  394. package/deps/rocksdb/rocksdb/util/auto_skip_compressor.cc +131 -0
  395. package/deps/rocksdb/rocksdb/util/auto_skip_compressor.h +90 -0
  396. package/deps/rocksdb/rocksdb/util/autovector.h +1 -1
  397. package/deps/rocksdb/rocksdb/util/autovector_test.cc +2 -2
  398. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +0 -2
  399. package/deps/rocksdb/rocksdb/util/compression.cc +936 -4
  400. package/deps/rocksdb/rocksdb/util/compression.h +348 -232
  401. package/deps/rocksdb/rocksdb/util/compression_test.cc +229 -0
  402. package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +10 -10
  403. package/deps/rocksdb/rocksdb/util/crc32c_ppc.c +1 -0
  404. package/deps/rocksdb/rocksdb/util/data_structure.cc +2 -0
  405. package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +1 -3
  406. package/deps/rocksdb/rocksdb/util/ppc-opcode.h +5 -5
  407. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +108 -0
  408. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +67 -0
  409. package/deps/rocksdb/rocksdb/util/slice_test.cc +83 -0
  410. package/deps/rocksdb/rocksdb/util/string_util.cc +0 -2
  411. package/deps/rocksdb/rocksdb/util/string_util.h +10 -0
  412. package/deps/rocksdb/rocksdb/util/thread_operation.h +2 -1
  413. package/deps/rocksdb/rocksdb/util/udt_util.cc +18 -5
  414. package/deps/rocksdb/rocksdb/util/udt_util.h +10 -7
  415. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +650 -154
  416. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +438 -144
  417. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc +0 -1
  418. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.h +0 -1
  419. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_gc_stats.h +0 -1
  420. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +16 -17
  421. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +2 -1
  422. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +0 -1
  423. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +0 -1
  424. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +7 -8
  425. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +4 -3
  426. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.h +0 -1
  427. package/deps/rocksdb/rocksdb/utilities/cache_dump_load.cc +0 -1
  428. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +2 -2
  429. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -1
  430. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +0 -48
  431. package/deps/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc +0 -1
  432. package/deps/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h +0 -1
  433. package/deps/rocksdb/rocksdb/utilities/debug.cc +7 -14
  434. package/deps/rocksdb/rocksdb/utilities/env_mirror.cc +0 -1
  435. package/deps/rocksdb/rocksdb/utilities/env_mirror_test.cc +0 -2
  436. package/deps/rocksdb/rocksdb/utilities/env_timed.cc +0 -1
  437. package/deps/rocksdb/rocksdb/utilities/env_timed_test.cc +0 -2
  438. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +5 -3
  439. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +10 -9
  440. package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +0 -1
  441. package/deps/rocksdb/rocksdb/utilities/memory/memory_util.cc +0 -1
  442. package/deps/rocksdb/rocksdb/utilities/memory_allocators.h +1 -0
  443. package/deps/rocksdb/rocksdb/utilities/object_registry_test.cc +0 -2
  444. package/deps/rocksdb/rocksdb/utilities/options/options_util.cc +0 -1
  445. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +0 -1
  446. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.cc +0 -1
  447. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.h +0 -2
  448. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h +0 -2
  449. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.cc +0 -1
  450. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h +0 -2
  451. package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table.h +0 -2
  452. package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table_evictable.h +0 -2
  453. package/deps/rocksdb/rocksdb/utilities/persistent_cache/lrulist.h +0 -2
  454. package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_test.h +0 -2
  455. package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_tier.cc +0 -1
  456. package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_tier.h +0 -2
  457. package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc +0 -1
  458. package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.h +0 -2
  459. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.cc +183 -32
  460. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index_test.cc +258 -12
  461. package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_helper.h +33 -0
  462. package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_iterator.cc +99 -0
  463. package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_mixin.h +280 -120
  464. package/deps/rocksdb/rocksdb/utilities/secondary_index/simple_secondary_index.cc +79 -0
  465. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc +52 -16
  466. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h +10 -6
  467. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc +55 -0
  468. package/deps/rocksdb/rocksdb/utilities/trace/replayer_impl.cc +0 -1
  469. package/deps/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.cc +0 -2
  470. package/deps/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.h +0 -1
  471. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc +37 -12
  472. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.h +2 -0
  473. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc +0 -2
  474. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc +0 -2
  475. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_locking_test.cc +1 -1
  476. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h +1 -1
  477. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h +1 -1
  478. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc +2 -1
  479. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc +2 -2
  480. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction.cc +0 -1
  481. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction.h +0 -2
  482. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +1 -3
  483. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +36 -10
  484. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +5 -7
  485. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +4 -5
  486. package/deps/rocksdb/rocksdb/utilities/transactions/snapshot_checker.cc +1 -4
  487. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +1 -2
  488. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc +0 -2
  489. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_db_mutex_impl.h +0 -1
  490. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +1118 -37
  491. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +4 -7
  492. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.cc +0 -2
  493. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.h +0 -2
  494. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +3 -3
  495. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +0 -1
  496. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +0 -2
  497. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +1 -2
  498. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.h +1 -2
  499. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.cc +0 -1
  500. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +0 -3
  501. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +125 -127
  502. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +45 -23
  503. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +54 -22
  504. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +477 -58
  505. package/deps/rocksdb/rocksdb.gyp +9 -4
  506. package/index.js +50 -9
  507. package/package.json +8 -1
  508. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  509. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
@@ -67,7 +67,7 @@ Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family,
67
67
  return s;
68
68
  }
69
69
  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
70
- if (!cfh->cfd()->ioptions()->merge_operator) {
70
+ if (!cfh->cfd()->ioptions().merge_operator) {
71
71
  return Status::NotSupported("Provide a merge_operator when opening DB");
72
72
  } else {
73
73
  return DB::Merge(o, column_family, key, val);
@@ -157,7 +157,7 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
157
157
  if (s.ok()) {
158
158
  s = WriteImpl(write_options, my_batch, /*callback=*/nullptr,
159
159
  /*user_write_cb=*/nullptr,
160
- /*log_used=*/nullptr);
160
+ /*wal_used=*/nullptr);
161
161
  }
162
162
  return s;
163
163
  }
@@ -190,11 +190,38 @@ Status DBImpl::WriteWithCallback(const WriteOptions& write_options,
190
190
  return s;
191
191
  }
192
192
 
193
- Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
194
- const WBWIMemTable::SeqnoRange& assigned_seqno,
195
- uint64_t prep_log,
196
- SequenceNumber last_seqno_after_ingest,
197
- bool memtable_updated, bool ignore_missing_cf) {
193
+ Status DBImpl::IngestWriteBatchWithIndex(
194
+ const WriteOptions& write_options,
195
+ std::shared_ptr<WriteBatchWithIndex> wbwi) {
196
+ if (!wbwi) {
197
+ return Status::InvalidArgument("Batch is nullptr!");
198
+ }
199
+ if (!write_options.disableWAL) {
200
+ return Status::NotSupported(
201
+ "IngestWriteBatchWithIndex does not support disableWAL=true");
202
+ }
203
+ Status s;
204
+ if (write_options.protection_bytes_per_key > 0) {
205
+ s = WriteBatchInternal::UpdateProtectionInfo(
206
+ wbwi->GetWriteBatch(), write_options.protection_bytes_per_key);
207
+ }
208
+ if (s.ok()) {
209
+ WriteBatch dummy_empty_batch;
210
+ s = WriteImpl(
211
+ write_options, /*updates=*/&dummy_empty_batch, /*callback=*/nullptr,
212
+ /*user_write_cb=*/nullptr, /*log_used=*/nullptr, /*log_ref=*/0,
213
+ /*disable_memtable=*/false, /*seq_used=*/nullptr,
214
+ /*batch_cnt=*/0, /*pre_release_callback=*/nullptr,
215
+ /*post_memtable_callback=*/nullptr, /*wbwi=*/wbwi);
216
+ }
217
+ return s;
218
+ }
219
+
220
+ Status DBImpl::IngestWBWIAsMemtable(
221
+ std::shared_ptr<WriteBatchWithIndex> wbwi,
222
+ const WBWIMemTable::SeqnoRange& assigned_seqno, uint64_t min_prep_log,
223
+ SequenceNumber last_seqno_after_ingest, bool memtable_updated,
224
+ bool ignore_missing_cf) {
198
225
  // Keys in new memtable have seqno > last_seqno_after_ingest >= keys in wbwi.
199
226
  assert(assigned_seqno.upper_bound <= last_seqno_after_ingest);
200
227
  // Keys in the current memtable have seqno <= LastSequence() < keys in wbwi.
@@ -205,7 +232,7 @@ Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
205
232
  ColumnFamilySet* cf_set = versions_->GetColumnFamilySet();
206
233
 
207
234
  // Create WBWIMemTables
208
- for (const auto [cf_id, stat] : wbwi->GetCFStats()) {
235
+ for (const auto& [cf_id, stat] : wbwi->GetCFStats()) {
209
236
  ColumnFamilyData* cfd = cf_set->GetColumnFamily(cf_id);
210
237
  if (!cfd) {
211
238
  if (ignore_missing_cf) {
@@ -232,18 +259,36 @@ Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
232
259
  return s;
233
260
  }
234
261
  WBWIMemTable* wbwi_memtable =
235
- new WBWIMemTable(wbwi, cfd->user_comparator(), cf_id, cfd->ioptions(),
236
- cfd->GetLatestMutableCFOptions(), stat);
262
+ new WBWIMemTable(wbwi, cfd->user_comparator(), cf_id, &cfd->ioptions(),
263
+ &cfd->GetLatestMutableCFOptions(), stat);
237
264
  wbwi_memtable->Ref();
238
265
  wbwi_memtable->AssignSequenceNumbers(assigned_seqno);
239
266
  // This is needed to keep the WAL that contains Prepare alive until
240
267
  // committed data in this memtable is persisted.
241
- wbwi_memtable->SetMinPrepLog(prep_log);
268
+ wbwi_memtable->SetMinPrepLog(min_prep_log);
242
269
  memtables.push_back(wbwi_memtable);
243
270
  cfd->Ref();
244
271
  cfds.push_back(cfd);
245
272
  }
246
273
 
274
+ autovector<ColumnFamilyData*> cfds_for_atomic_flush;
275
+ if (immutable_db_options_.atomic_flush) {
276
+ SelectColumnFamiliesForAtomicFlush(&cfds_for_atomic_flush);
277
+ for (auto cfd : cfds_for_atomic_flush) {
278
+ bool found = false;
279
+ for (auto existing_cfd : cfds) {
280
+ if (existing_cfd == cfd) {
281
+ found = true;
282
+ break;
283
+ }
284
+ }
285
+ if (!found) {
286
+ cfd->Ref();
287
+ cfds.push_back(cfd);
288
+ }
289
+ }
290
+ }
291
+
247
292
  // Stop writes to the DB by entering both write threads
248
293
  WriteThread::Writer nonmem_w;
249
294
  if (two_write_queues_) {
@@ -253,15 +298,16 @@ Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
253
298
 
254
299
  // Switch memtable and add WBWIMemTables
255
300
  Status s;
256
- for (size_t i = 0; i < memtables.size(); ++i) {
257
- assert(!immutable_db_options_.atomic_flush);
258
- // NOTE: to support atomic flush, need to call
259
- // SelectColumnFamiliesForAtomicFlush()
301
+ for (size_t i = 0; i < cfds.size(); ++i) {
260
302
  WriteContext write_context;
261
303
  // TODO: not switch on empty memtable, may need to update metadata
262
304
  // like NextLogNumber(), earliest_seqno and memtable id.
263
- s = SwitchMemtable(cfds[i], &write_context, memtables[i],
264
- last_seqno_after_ingest);
305
+ if (i < memtables.size()) {
306
+ s = SwitchMemtable(cfds[i], &write_context, memtables[i],
307
+ last_seqno_after_ingest);
308
+ } else {
309
+ s = SwitchMemtable(cfds[i], &write_context);
310
+ }
265
311
  if (!s.ok()) {
266
312
  // SwitchMemtable() can only fail if a new WAL is to be created, this
267
313
  // should only happen for the first call to SwitchMemtable(). log will
@@ -301,9 +347,18 @@ Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
301
347
  continue;
302
348
  }
303
349
  cfd->imm()->FlushRequested();
350
+ if (!immutable_db_options_.atomic_flush) {
351
+ FlushRequest flush_req;
352
+ // TODO: a new flush reason for ingesting memtable
353
+ GenerateFlushRequest({cfd}, FlushReason::kExternalFileIngestion,
354
+ &flush_req);
355
+ EnqueuePendingFlush(flush_req);
356
+ }
357
+ }
358
+ if (immutable_db_options_.atomic_flush) {
359
+ AssignAtomicFlushSeq(cfds);
304
360
  FlushRequest flush_req;
305
- // TODO: a new flush reason for ingesting memtable
306
- GenerateFlushRequest({cfd}, FlushReason::kExternalFileIngestion,
361
+ GenerateFlushRequest(cfds, FlushReason::kExternalFileIngestion,
307
362
  &flush_req);
308
363
  EnqueuePendingFlush(flush_req);
309
364
  }
@@ -314,13 +369,12 @@ Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
314
369
 
315
370
  Status DBImpl::WriteImpl(const WriteOptions& write_options,
316
371
  WriteBatch* my_batch, WriteCallback* callback,
317
- UserWriteCallback* user_write_cb, uint64_t* log_used,
372
+ UserWriteCallback* user_write_cb, uint64_t* wal_used,
318
373
  uint64_t log_ref, bool disable_memtable,
319
374
  uint64_t* seq_used, size_t batch_cnt,
320
375
  PreReleaseCallback* pre_release_callback,
321
376
  PostMemTableCallback* post_memtable_callback,
322
- std::shared_ptr<WriteBatchWithIndex> wbwi,
323
- uint64_t prep_log) {
377
+ std::shared_ptr<WriteBatchWithIndex> wbwi) {
324
378
  assert(!seq_per_batch_ || batch_cnt != 0);
325
379
  assert(my_batch == nullptr || my_batch->Count() == 0 ||
326
380
  write_options.protection_bytes_per_key == 0 ||
@@ -409,9 +463,17 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
409
463
  return Status::NotSupported(
410
464
  "DeleteRange is not compatible with row cache.");
411
465
  }
466
+ // Whether the WBWI is from transaction commit or a direct write
467
+ // (IngestWriteBatchWithIndex())
468
+ bool ingest_wbwi_for_commit = false;
412
469
  if (wbwi) {
413
- assert(prep_log > 0);
414
- // Used only in WriteCommittedTxn::CommitInternal() with no `callback`.
470
+ if (my_batch->HasCommit()) {
471
+ ingest_wbwi_for_commit = true;
472
+ assert(log_ref);
473
+ } else {
474
+ // Only supports disableWAL for directly ingesting WBWI for now.
475
+ assert(write_options.disableWAL);
476
+ }
415
477
  assert(!callback);
416
478
  if (immutable_db_options_.unordered_write) {
417
479
  return Status::NotSupported(
@@ -421,9 +483,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
421
483
  return Status::NotSupported(
422
484
  "Ingesting WriteBatch does not support pipelined_write");
423
485
  }
424
- if (immutable_db_options_.atomic_flush) {
486
+ if (!wbwi->GetOverwriteKey()) {
425
487
  return Status::NotSupported(
426
- "Ingesting WriteBatch does not support atomic_flush");
488
+ "WriteBatchWithIndex ingestion requires overwrite_key=true");
427
489
  }
428
490
  }
429
491
  // Otherwise IsLatestPersistentState optimization does not make sense
@@ -444,7 +506,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
444
506
  // they don't consume sequence.
445
507
  return WriteImplWALOnly(
446
508
  &nonmem_write_thread_, write_options, my_batch, callback, user_write_cb,
447
- log_used, log_ref, seq_used, batch_cnt, pre_release_callback,
509
+ wal_used, log_ref, seq_used, batch_cnt, pre_release_callback,
448
510
  assign_order, kDontPublishLastSeq, disable_memtable);
449
511
  }
450
512
 
@@ -458,7 +520,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
458
520
  // sequence in in increasing order, iii) call pre_release_callback serially
459
521
  Status status = WriteImplWALOnly(
460
522
  &write_thread_, write_options, my_batch, callback, user_write_cb,
461
- log_used, log_ref, &seq, sub_batch_cnt, pre_release_callback,
523
+ wal_used, log_ref, &seq, sub_batch_cnt, pre_release_callback,
462
524
  kDoAssignOrder, kDoPublishLastSeq, disable_memtable);
463
525
  TEST_SYNC_POINT("DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL");
464
526
  if (!status.ok()) {
@@ -477,7 +539,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
477
539
 
478
540
  if (immutable_db_options_.enable_pipelined_write) {
479
541
  return PipelinedWriteImpl(write_options, my_batch, callback, user_write_cb,
480
- log_used, log_ref, disable_memtable, seq_used);
542
+ wal_used, log_ref, disable_memtable, seq_used);
481
543
  }
482
544
 
483
545
  PERF_TIMER_GUARD(write_pre_and_post_process_time);
@@ -524,16 +586,19 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
524
586
  assert(tmp_s.ok());
525
587
  }
526
588
  }
527
- versions_->SetLastSequence(last_sequence);
528
- MemTableInsertStatusCheck(w.status);
589
+ if (w.status.ok()) { // Don't publish a partial batch write
590
+ versions_->SetLastSequence(last_sequence);
591
+ } else {
592
+ HandleMemTableInsertFailure(w.status);
593
+ }
529
594
  write_thread_.ExitAsBatchGroupFollower(&w);
530
595
  }
531
596
  assert(w.state == WriteThread::STATE_COMPLETED);
532
597
  // STATE_COMPLETED conditional below handles exit
533
598
  }
534
599
  if (w.state == WriteThread::STATE_COMPLETED) {
535
- if (log_used != nullptr) {
536
- *log_used = w.log_used;
600
+ if (wal_used != nullptr) {
601
+ *wal_used = w.wal_used;
537
602
  }
538
603
  if (seq_used != nullptr) {
539
604
  *seq_used = w.sequence;
@@ -549,7 +614,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
549
614
  // when it finds suitable, and finish them in the same write batch.
550
615
  // This is how a write job could be done by the other writer.
551
616
  WriteContext write_context;
552
- LogContext log_context(write_options.sync);
617
+ // FIXME: also check disableWAL like others?
618
+ WalContext wal_context(write_options.sync);
553
619
  WriteThread::WriteGroup write_group;
554
620
  bool in_parallel_group = false;
555
621
  uint64_t last_sequence = kMaxSequenceNumber;
@@ -563,7 +629,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
563
629
  // PreprocessWrite does its own perf timing.
564
630
  PERF_TIMER_STOP(write_pre_and_post_process_time);
565
631
 
566
- status = PreprocessWrite(write_options, &log_context, &write_context);
632
+ status = PreprocessWrite(write_options, &wal_context, &write_context);
567
633
  if (!two_write_queues_) {
568
634
  // Assign it after ::PreprocessWrite since the sequence might advance
569
635
  // inside it by WriteRecoverableState
@@ -587,6 +653,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
587
653
 
588
654
  IOStatus io_s;
589
655
  Status pre_release_cb_status;
656
+ size_t seq_inc = 0;
590
657
  if (status.ok()) {
591
658
  // Rules for when we can update the memtable concurrently
592
659
  // 1. supported by memtable
@@ -630,7 +697,13 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
630
697
  continue;
631
698
  }
632
699
  // TODO: maybe handle the tracing status?
633
- tracer_->Write(writer->batch).PermitUncheckedError();
700
+ if (wbwi && !ingest_wbwi_for_commit) {
701
+ // for transaction write, tracer only needs the commit marker which
702
+ // is in writer->batch
703
+ tracer_->Write(wbwi->GetWriteBatch()).PermitUncheckedError();
704
+ } else {
705
+ tracer_->Write(writer->batch).PermitUncheckedError();
706
+ }
634
707
  }
635
708
  }
636
709
  }
@@ -640,7 +713,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
640
713
  // disable_memtable in between; although we do not write this batch to
641
714
  // memtable it still consumes a seq. Otherwise, if !seq_per_batch_, we inc
642
715
  // the seq per valid written key to mem.
643
- size_t seq_inc = seq_per_batch_ ? valid_batches : total_count;
716
+ seq_inc = seq_per_batch_ ? valid_batches : total_count;
644
717
  if (wbwi) {
645
718
  // Reserve sequence numbers for the ingested memtable. We need to reserve
646
719
  // at lease this amount for recovery. During recovery,
@@ -688,22 +761,21 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
688
761
 
689
762
  if (!two_write_queues_) {
690
763
  if (status.ok() && !write_options.disableWAL) {
691
- assert(log_context.log_file_number_size);
692
- LogFileNumberSize& log_file_number_size =
693
- *(log_context.log_file_number_size);
764
+ assert(wal_context.wal_file_number_size);
765
+ wal_context.prev_size = wal_context.writer->file()->GetFileSize();
694
766
  PERF_TIMER_GUARD(write_wal_time);
695
- io_s =
696
- WriteToWAL(write_group, log_context.writer, log_used,
697
- log_context.need_log_sync, log_context.need_log_dir_sync,
698
- last_sequence + 1, log_file_number_size);
767
+ io_s = WriteGroupToWAL(write_group, wal_context.writer, wal_used,
768
+ wal_context.need_wal_sync,
769
+ wal_context.need_wal_dir_sync, last_sequence + 1,
770
+ *wal_context.wal_file_number_size);
699
771
  }
700
772
  } else {
701
773
  if (status.ok() && !write_options.disableWAL) {
702
774
  PERF_TIMER_GUARD(write_wal_time);
703
775
  // LastAllocatedSequence is increased inside WriteToWAL under
704
776
  // wal_write_mutex_ to ensure ordered events in WAL
705
- io_s = ConcurrentWriteToWAL(write_group, log_used, &last_sequence,
706
- seq_inc);
777
+ io_s = ConcurrentWriteGroupToWAL(write_group, wal_used, &last_sequence,
778
+ seq_inc);
707
779
  } else {
708
780
  // Otherwise we inc seq number for memtable writes
709
781
  last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
@@ -713,17 +785,18 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
713
785
  assert(last_sequence != kMaxSequenceNumber);
714
786
  const SequenceNumber current_sequence = last_sequence + 1;
715
787
  last_sequence += seq_inc;
788
+ // Seqno assigned to this write are [current_sequence, last_sequence]
716
789
 
717
- if (log_context.need_log_sync) {
790
+ if (wal_context.need_wal_sync) {
718
791
  VersionEdit synced_wals;
719
- log_write_mutex_.Lock();
792
+ wal_write_mutex_.Lock();
720
793
  if (status.ok()) {
721
- MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
794
+ MarkLogsSynced(cur_wal_number_, wal_context.need_wal_dir_sync,
722
795
  &synced_wals);
723
796
  } else {
724
- MarkLogsNotSynced(logfile_number_);
797
+ MarkLogsNotSynced(cur_wal_number_);
725
798
  }
726
- log_write_mutex_.Unlock();
799
+ wal_write_mutex_.Unlock();
727
800
  if (status.ok() && synced_wals.IsWalAddition()) {
728
801
  InstrumentedMutexLock l(&mutex_);
729
802
  // TODO: plumb Env::IOActivity, Env::IOPriority
@@ -758,7 +831,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
758
831
  writer->sequence = next_sequence;
759
832
  if (writer->pre_release_callback) {
760
833
  Status ws = writer->pre_release_callback->Callback(
761
- writer->sequence, disable_memtable, writer->log_used, index++,
834
+ writer->sequence, disable_memtable, writer->wal_used, index++,
762
835
  pre_release_callback_cnt);
763
836
  if (!ws.ok()) {
764
837
  status = pre_release_cb_status = ws;
@@ -783,8 +856,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
783
856
  write_group, current_sequence, column_family_memtables_.get(),
784
857
  &flush_scheduler_, &trim_history_scheduler_,
785
858
  write_options.ignore_missing_column_families,
786
- 0 /*recovery_log_number*/, this, parallel, seq_per_batch_,
787
- batch_per_txn_);
859
+ 0 /*recovery_log_number*/, this, seq_per_batch_, batch_per_txn_);
788
860
  } else {
789
861
  write_group.last_sequence = last_sequence;
790
862
  write_thread_.LaunchParallelMemTableWriters(&write_group);
@@ -832,24 +904,31 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
832
904
  // handle exit, false means somebody else did
833
905
  should_exit_batch_group = write_thread_.CompleteParallelMemTableWriter(&w);
834
906
  }
835
- if (wbwi) {
836
- if (status.ok() && w.status.ok()) {
907
+ if (wbwi && status.ok() && w.status.ok()) {
908
+ uint32_t wbwi_count = wbwi->GetWriteBatch()->Count();
909
+ // skip empty batch case
910
+ if (wbwi_count) {
837
911
  // w.batch contains (potentially empty) commit time batch updates,
838
912
  // only ingest wbwi if w.batch is applied to memtable successfully
839
- assert(wbwi->GetWriteBatch()->Count() > 0);
840
-
841
913
  uint32_t memtable_update_count = w.batch->Count();
842
- SequenceNumber lb = versions_->LastSequence() + memtable_update_count + 1;
843
- SequenceNumber ub = versions_->LastSequence() + memtable_update_count +
844
- wbwi->GetWriteBatch()->Count();
845
- assert(ub == last_sequence);
914
+ // Seqno assigned to this write are [last_seq + 1 - seq_inc, last_seq].
915
+ // seq_inc includes w.batch (memtable updates) and wbwi
916
+ // w.batch gets first `memtable_update_count` sequence numbers.
917
+ // wbwi gets the rest `wbwi_count` sequence numbers.
918
+ assert(seq_inc == memtable_update_count + wbwi_count);
919
+ assert(wbwi_count > 0);
920
+ assert(last_sequence != kMaxSequenceNumber);
921
+ SequenceNumber lb = last_sequence + 1 - wbwi_count;
922
+ SequenceNumber ub = last_sequence;
846
923
  if (two_write_queues_) {
847
924
  assert(ub <= versions_->LastAllocatedSequence());
848
925
  }
849
- status = IngestWBWI(wbwi, {/*lower_bound=*/lb, /*upper_bound=*/ub},
850
- prep_log, last_sequence,
851
- /*memtable_updated=*/memtable_update_count > 0,
852
- write_options.ignore_missing_column_families);
926
+ status =
927
+ IngestWBWIAsMemtable(wbwi, {/*lower_bound=*/lb, /*upper_bound=*/ub},
928
+ /*min_prep_log=*/log_ref, last_sequence,
929
+ /*memtable_updated=*/memtable_update_count > 0,
930
+ write_options.ignore_missing_column_families);
931
+ RecordTick(stats_, NUMBER_WBWI_INGEST);
853
932
  }
854
933
  }
855
934
 
@@ -867,9 +946,19 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
867
946
  }
868
947
  // Note: if we are to resume after non-OK statuses we need to revisit how
869
948
  // we react to non-OK statuses here.
870
- versions_->SetLastSequence(last_sequence);
949
+ if (w.status.ok()) { // Don't publish a partial batch write
950
+ versions_->SetLastSequence(last_sequence);
951
+ }
952
+ }
953
+ if (!w.status.ok()) {
954
+ if (wal_context.prev_size < SIZE_MAX) {
955
+ InstrumentedMutexLock l(&wal_write_mutex_);
956
+ if (logs_.back().number == wal_context.wal_file_number_size->number) {
957
+ logs_.back().SetAttemptTruncateSize(wal_context.prev_size);
958
+ }
959
+ }
960
+ HandleMemTableInsertFailure(w.status);
871
961
  }
872
- MemTableInsertStatusCheck(w.status);
873
962
  write_thread_.ExitAsBatchGroupLeader(write_group, status);
874
963
  }
875
964
 
@@ -882,7 +971,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
882
971
  Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
883
972
  WriteBatch* my_batch, WriteCallback* callback,
884
973
  UserWriteCallback* user_write_cb,
885
- uint64_t* log_used, uint64_t log_ref,
974
+ uint64_t* wal_used, uint64_t log_ref,
886
975
  bool disable_memtable, uint64_t* seq_used) {
887
976
  PERF_TIMER_GUARD(write_pre_and_post_process_time);
888
977
  StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
@@ -899,10 +988,10 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
899
988
  if (w.callback && !w.callback->AllowWriteBatching()) {
900
989
  write_thread_.WaitForMemTableWriters();
901
990
  }
902
- LogContext log_context(!write_options.disableWAL && write_options.sync);
991
+ WalContext wal_context(!write_options.disableWAL && write_options.sync);
903
992
  // PreprocessWrite does its own perf timing.
904
993
  PERF_TIMER_STOP(write_pre_and_post_process_time);
905
- w.status = PreprocessWrite(write_options, &log_context, &write_context);
994
+ w.status = PreprocessWrite(write_options, &wal_context, &write_context);
906
995
  PERF_TIMER_START(write_pre_and_post_process_time);
907
996
 
908
997
  // This can set non-OK status if callback fail.
@@ -971,13 +1060,13 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
971
1060
  wal_write_group.size - 1);
972
1061
  RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1);
973
1062
  }
974
- assert(log_context.log_file_number_size);
975
- LogFileNumberSize& log_file_number_size =
976
- *(log_context.log_file_number_size);
977
- io_s =
978
- WriteToWAL(wal_write_group, log_context.writer, log_used,
979
- log_context.need_log_sync, log_context.need_log_dir_sync,
980
- current_sequence, log_file_number_size);
1063
+ assert(wal_context.wal_file_number_size);
1064
+ WalFileNumberSize& wal_file_number_size =
1065
+ *(wal_context.wal_file_number_size);
1066
+ io_s = WriteGroupToWAL(wal_write_group, wal_context.writer, wal_used,
1067
+ wal_context.need_wal_sync,
1068
+ wal_context.need_wal_dir_sync, current_sequence,
1069
+ wal_file_number_size);
981
1070
  w.status = io_s;
982
1071
  }
983
1072
 
@@ -989,13 +1078,13 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
989
1078
  }
990
1079
 
991
1080
  VersionEdit synced_wals;
992
- if (log_context.need_log_sync) {
993
- InstrumentedMutexLock l(&log_write_mutex_);
1081
+ if (wal_context.need_wal_sync) {
1082
+ InstrumentedMutexLock l(&wal_write_mutex_);
994
1083
  if (w.status.ok()) {
995
- MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
1084
+ MarkLogsSynced(cur_wal_number_, wal_context.need_wal_dir_sync,
996
1085
  &synced_wals);
997
1086
  } else {
998
- MarkLogsNotSynced(logfile_number_);
1087
+ MarkLogsNotSynced(cur_wal_number_);
999
1088
  }
1000
1089
  }
1001
1090
  if (w.status.ok() && synced_wals.IsWalAddition()) {
@@ -1025,8 +1114,13 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
1025
1114
  memtable_write_group, w.sequence, column_family_memtables_.get(),
1026
1115
  &flush_scheduler_, &trim_history_scheduler_,
1027
1116
  write_options.ignore_missing_column_families, 0 /*log_number*/, this,
1028
- false /*concurrent_memtable_writes*/, seq_per_batch_, batch_per_txn_);
1029
- versions_->SetLastSequence(memtable_write_group.last_sequence);
1117
+ seq_per_batch_, batch_per_txn_);
1118
+ if (memtable_write_group.status
1119
+ .ok()) { // Don't publish a partial batch write
1120
+ versions_->SetLastSequence(memtable_write_group.last_sequence);
1121
+ } else {
1122
+ HandleMemTableInsertFailure(memtable_write_group.status);
1123
+ }
1030
1124
  write_thread_.ExitAsMemTableWriter(&w, memtable_write_group);
1031
1125
  }
1032
1126
  } else {
@@ -1055,8 +1149,11 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
1055
1149
  PERF_TIMER_START(write_pre_and_post_process_time);
1056
1150
 
1057
1151
  if (write_thread_.CompleteParallelMemTableWriter(&w)) {
1058
- MemTableInsertStatusCheck(w.status);
1059
- versions_->SetLastSequence(w.write_group->last_sequence);
1152
+ if (w.status.ok()) { // Don't publish a partial batch write
1153
+ versions_->SetLastSequence(w.write_group->last_sequence);
1154
+ } else {
1155
+ HandleMemTableInsertFailure(w.status);
1156
+ }
1060
1157
  write_thread_.ExitAsMemTableWriter(&w, *w.write_group);
1061
1158
  }
1062
1159
  }
@@ -1128,7 +1225,7 @@ Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options,
1128
1225
  Status DBImpl::WriteImplWALOnly(
1129
1226
  WriteThread* write_thread, const WriteOptions& write_options,
1130
1227
  WriteBatch* my_batch, WriteCallback* callback,
1131
- UserWriteCallback* user_write_cb, uint64_t* log_used,
1228
+ UserWriteCallback* user_write_cb, uint64_t* wal_used,
1132
1229
  const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
1133
1230
  PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
1134
1231
  const PublishLastSeq publish_last_seq, const bool disable_memtable) {
@@ -1141,8 +1238,8 @@ Status DBImpl::WriteImplWALOnly(
1141
1238
  write_thread->JoinBatchGroup(&w);
1142
1239
  assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER);
1143
1240
  if (w.state == WriteThread::STATE_COMPLETED) {
1144
- if (log_used != nullptr) {
1145
- *log_used = w.log_used;
1241
+ if (wal_used != nullptr) {
1242
+ *wal_used = w.wal_used;
1146
1243
  }
1147
1244
  if (seq_used != nullptr) {
1148
1245
  *seq_used = w.sequence;
@@ -1158,10 +1255,10 @@ Status DBImpl::WriteImplWALOnly(
1158
1255
 
1159
1256
  // TODO(myabandeh): Make preliminary checks thread-safe so we could do them
1160
1257
  // without paying the cost of obtaining the mutex.
1161
- LogContext log_context;
1258
+ WalContext wal_context;
1162
1259
  WriteContext write_context;
1163
1260
  Status status =
1164
- PreprocessWrite(write_options, &log_context, &write_context);
1261
+ PreprocessWrite(write_options, &wal_context, &write_context);
1165
1262
  WriteStatusCheckOnLocked(status);
1166
1263
 
1167
1264
  if (!status.ok()) {
@@ -1258,8 +1355,8 @@ Status DBImpl::WriteImplWALOnly(
1258
1355
  }
1259
1356
  Status status;
1260
1357
  if (!write_options.disableWAL) {
1261
- IOStatus io_s =
1262
- ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc);
1358
+ IOStatus io_s = ConcurrentWriteGroupToWAL(write_group, wal_used,
1359
+ &last_sequence, seq_inc);
1263
1360
  status = io_s;
1264
1361
  // last_sequence may not be set if there is an error
1265
1362
  // This error checking and return is moved up to avoid using uninitialized
@@ -1311,7 +1408,7 @@ Status DBImpl::WriteImplWALOnly(
1311
1408
  if (!writer->CallbackFailed() && writer->pre_release_callback) {
1312
1409
  assert(writer->sequence != kMaxSequenceNumber);
1313
1410
  Status ws = writer->pre_release_callback->Callback(
1314
- writer->sequence, disable_memtable, writer->log_used, index++,
1411
+ writer->sequence, disable_memtable, writer->wal_used, index++,
1315
1412
  pre_release_callback_cnt);
1316
1413
  if (!ws.ok()) {
1317
1414
  status = ws;
@@ -1380,24 +1477,22 @@ void DBImpl::WALIOStatusCheck(const IOStatus& io_status) {
1380
1477
  }
1381
1478
  }
1382
1479
 
1383
- void DBImpl::MemTableInsertStatusCheck(const Status& status) {
1384
- // A non-OK status here indicates that the state implied by the
1385
- // WAL has diverged from the in-memory state. This could be
1386
- // because of a corrupt write_batch (very bad), or because the
1387
- // client specified an invalid column family and didn't specify
1388
- // ignore_missing_column_families.
1389
- if (!status.ok()) {
1390
- mutex_.Lock();
1391
- assert(!error_handler_.IsBGWorkStopped());
1392
- error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable);
1393
- mutex_.Unlock();
1394
- }
1480
+ void DBImpl::HandleMemTableInsertFailure(const Status& status) {
1481
+ assert(!status.ok());
1482
+ // A non-OK status on memtable insert indicates that the state implied by the
1483
+ // WAL has diverged from the in-memory state. This could be because of a
1484
+ // corrupt write_batch (very bad), or because the client specified an invalid
1485
+ // column family and didn't specify ignore_missing_column_families.
1486
+ mutex_.Lock();
1487
+ assert(!error_handler_.IsBGWorkStopped());
1488
+ error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable);
1489
+ mutex_.Unlock();
1395
1490
  }
1396
1491
 
1397
1492
  Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
1398
- LogContext* log_context,
1493
+ WalContext* wal_context,
1399
1494
  WriteContext* write_context) {
1400
- assert(write_context != nullptr && log_context != nullptr);
1495
+ assert(write_context != nullptr && wal_context != nullptr);
1401
1496
  Status status;
1402
1497
 
1403
1498
  if (error_handler_.IsDBStopped()) {
@@ -1407,7 +1502,8 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
1407
1502
 
1408
1503
  PERF_TIMER_GUARD(write_scheduling_flushes_compactions_time);
1409
1504
 
1410
- if (UNLIKELY(status.ok() && total_log_size_ > GetMaxTotalWalSize())) {
1505
+ if (UNLIKELY(status.ok() &&
1506
+ wals_total_size_.LoadRelaxed() > GetMaxTotalWalSize())) {
1411
1507
  assert(versions_);
1412
1508
  InstrumentedMutexLock l(&mutex_);
1413
1509
  const ColumnFamilySet* const column_families =
@@ -1476,17 +1572,17 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
1476
1572
  WriteBufferManagerStallWrites();
1477
1573
  }
1478
1574
  }
1479
- InstrumentedMutexLock l(&log_write_mutex_);
1480
- if (status.ok() && log_context->need_log_sync) {
1575
+ InstrumentedMutexLock l(&wal_write_mutex_);
1576
+ if (status.ok() && wal_context->need_wal_sync) {
1481
1577
  // Wait until the parallel syncs are finished. Any sync process has to sync
1482
1578
  // the front log too so it is enough to check the status of front()
1483
- // We do a while loop since log_sync_cv_ is signalled when any sync is
1579
+ // We do a while loop since wal_sync_cv_ is signalled when any sync is
1484
1580
  // finished
1485
1581
  // Note: there does not seem to be a reason to wait for parallel sync at
1486
1582
  // this early step but it is not important since parallel sync (SyncWAL) and
1487
- // need_log_sync are usually not used together.
1583
+ // need_wal_sync are usually not used together.
1488
1584
  while (logs_.front().IsSyncing()) {
1489
- log_sync_cv_.Wait();
1585
+ wal_sync_cv_.Wait();
1490
1586
  }
1491
1587
  for (auto& log : logs_) {
1492
1588
  // This is just to prevent the logs to be synced by a parallel SyncWAL
@@ -1497,12 +1593,12 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
1497
1593
  log.PrepareForSync();
1498
1594
  }
1499
1595
  } else {
1500
- log_context->need_log_sync = false;
1596
+ wal_context->need_wal_sync = false;
1501
1597
  }
1502
- log_context->writer = logs_.back().writer;
1503
- log_context->need_log_dir_sync =
1504
- log_context->need_log_dir_sync && !log_dir_synced_;
1505
- log_context->log_file_number_size = std::addressof(alive_log_files_.back());
1598
+ wal_context->writer = logs_.back().writer;
1599
+ wal_context->need_wal_dir_sync =
1600
+ wal_context->need_wal_dir_sync && !wal_dir_synced_;
1601
+ wal_context->wal_file_number_size = std::addressof(alive_wal_files_.back());
1506
1602
 
1507
1603
  return status;
1508
1604
  }
@@ -1553,12 +1649,13 @@ Status DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
1553
1649
  }
1554
1650
 
1555
1651
  // When two_write_queues_ is disabled, this function is called from the only
1556
- // write thread. Otherwise this must be called holding log_write_mutex_.
1652
+ // write thread. Otherwise this must be called holding wal_write_mutex_.
1557
1653
  IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
1558
1654
  const WriteOptions& write_options,
1559
- log::Writer* log_writer, uint64_t* log_used,
1655
+ log::Writer* log_writer, uint64_t* wal_used,
1560
1656
  uint64_t* log_size,
1561
- LogFileNumberSize& log_file_number_size) {
1657
+ WalFileNumberSize& wal_file_number_size,
1658
+ SequenceNumber sequence) {
1562
1659
  assert(log_size != nullptr);
1563
1660
 
1564
1661
  Slice log_entry = WriteBatchInternal::Contents(&merged_batch);
@@ -1569,7 +1666,7 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
1569
1666
  }
1570
1667
  *log_size = log_entry.size();
1571
1668
  // When two_write_queues_ WriteToWAL has to be protected from concurretn calls
1572
- // from the two queues anyway and log_write_mutex_ is already held. Otherwise
1669
+ // from the two queues anyway and wal_write_mutex_ is already held. Otherwise
1573
1670
  // if manual_wal_flush_ is enabled we need to protect log_writer->AddRecord
1574
1671
  // from possible concurrent calls via the FlushWAL by the application.
1575
1672
  const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
@@ -1577,33 +1674,34 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
1577
1674
  // manual_wal_flush_ feature (by UNLIKELY) instead of the more common case
1578
1675
  // when we do not need any locking.
1579
1676
  if (UNLIKELY(needs_locking)) {
1580
- log_write_mutex_.Lock();
1677
+ wal_write_mutex_.Lock();
1581
1678
  }
1582
1679
  IOStatus io_s = log_writer->MaybeAddUserDefinedTimestampSizeRecord(
1583
1680
  write_options, versions_->GetColumnFamiliesTimestampSizeForRecord());
1584
1681
  if (!io_s.ok()) {
1585
1682
  return io_s;
1586
1683
  }
1587
- io_s = log_writer->AddRecord(write_options, log_entry);
1684
+ io_s = log_writer->AddRecord(write_options, log_entry, sequence);
1588
1685
 
1589
1686
  if (UNLIKELY(needs_locking)) {
1590
- log_write_mutex_.Unlock();
1687
+ wal_write_mutex_.Unlock();
1591
1688
  }
1592
- if (log_used != nullptr) {
1593
- *log_used = logfile_number_;
1689
+ if (wal_used != nullptr) {
1690
+ *wal_used = cur_wal_number_;
1691
+ assert(*wal_used == wal_file_number_size.number);
1594
1692
  }
1595
- total_log_size_ += log_entry.size();
1596
- log_file_number_size.AddSize(*log_size);
1597
- log_empty_ = false;
1693
+ wals_total_size_.FetchAddRelaxed(log_entry.size());
1694
+ wal_file_number_size.AddSize(*log_size);
1695
+ wal_empty_ = false;
1598
1696
 
1599
1697
  return io_s;
1600
1698
  }
1601
1699
 
1602
- IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
1603
- log::Writer* log_writer, uint64_t* log_used,
1604
- bool need_log_sync, bool need_log_dir_sync,
1605
- SequenceNumber sequence,
1606
- LogFileNumberSize& log_file_number_size) {
1700
+ IOStatus DBImpl::WriteGroupToWAL(const WriteThread::WriteGroup& write_group,
1701
+ log::Writer* log_writer, uint64_t* wal_used,
1702
+ bool need_wal_sync, bool need_wal_dir_sync,
1703
+ SequenceNumber sequence,
1704
+ WalFileNumberSize& wal_file_number_size) {
1607
1705
  IOStatus io_s;
1608
1706
  assert(!two_write_queues_);
1609
1707
  assert(!write_group.leader->disable_wal);
@@ -1618,10 +1716,10 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
1618
1716
  }
1619
1717
 
1620
1718
  if (merged_batch == write_group.leader->batch) {
1621
- write_group.leader->log_used = logfile_number_;
1719
+ write_group.leader->wal_used = cur_wal_number_;
1622
1720
  } else if (write_with_wal > 1) {
1623
1721
  for (auto writer : write_group) {
1624
- writer->log_used = logfile_number_;
1722
+ writer->wal_used = cur_wal_number_;
1625
1723
  }
1626
1724
  }
1627
1725
 
@@ -1633,14 +1731,14 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
1633
1731
  WriteOptions write_options;
1634
1732
  write_options.rate_limiter_priority =
1635
1733
  write_group.leader->rate_limiter_priority;
1636
- io_s = WriteToWAL(*merged_batch, write_options, log_writer, log_used,
1637
- &log_size, log_file_number_size);
1734
+ io_s = WriteToWAL(*merged_batch, write_options, log_writer, wal_used,
1735
+ &log_size, wal_file_number_size, sequence);
1638
1736
  if (to_be_cached_state) {
1639
1737
  cached_recoverable_state_ = *to_be_cached_state;
1640
1738
  cached_recoverable_state_empty_ = false;
1641
1739
  }
1642
1740
 
1643
- if (io_s.ok() && need_log_sync) {
1741
+ if (io_s.ok() && need_wal_sync) {
1644
1742
  StopWatch sw(immutable_db_options_.clock, stats_, WAL_FILE_SYNC_MICROS);
1645
1743
  // It's safe to access logs_ with unlocked mutex_ here because:
1646
1744
  // - we've set getting_synced=true for all logs,
@@ -1650,15 +1748,15 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
1650
1748
  // - as long as other threads don't modify it, it's safe to read
1651
1749
  // from std::deque from multiple threads concurrently.
1652
1750
  //
1653
- // Sync operation should work with locked log_write_mutex_, because:
1751
+ // Sync operation should work with locked wal_write_mutex_, because:
1654
1752
  // when DBOptions.manual_wal_flush_ is set,
1655
1753
  // FlushWAL function will be invoked by another thread.
1656
- // if without locked log_write_mutex_, the log file may get data
1754
+ // if without locked wal_write_mutex_, the log file may get data
1657
1755
  // corruption
1658
1756
 
1659
1757
  const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
1660
1758
  if (UNLIKELY(needs_locking)) {
1661
- log_write_mutex_.Lock();
1759
+ wal_write_mutex_.Lock();
1662
1760
  }
1663
1761
 
1664
1762
  if (io_s.ok()) {
@@ -1681,10 +1779,10 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
1681
1779
  }
1682
1780
 
1683
1781
  if (UNLIKELY(needs_locking)) {
1684
- log_write_mutex_.Unlock();
1782
+ wal_write_mutex_.Unlock();
1685
1783
  }
1686
1784
 
1687
- if (io_s.ok() && need_log_dir_sync) {
1785
+ if (io_s.ok() && need_wal_dir_sync) {
1688
1786
  // We only sync WAL directory the first time WAL syncing is
1689
1787
  // requested, so that in case users never turn on WAL sync,
1690
1788
  // we can avoid the disk I/O in the write code path.
@@ -1699,7 +1797,7 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
1699
1797
  }
1700
1798
  if (io_s.ok()) {
1701
1799
  auto stats = default_cf_internal_stats_;
1702
- if (need_log_sync) {
1800
+ if (need_wal_sync) {
1703
1801
  stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1);
1704
1802
  RecordTick(stats_, WAL_FILE_SYNCED);
1705
1803
  }
@@ -1716,8 +1814,8 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
1716
1814
  return io_s;
1717
1815
  }
1718
1816
 
1719
- IOStatus DBImpl::ConcurrentWriteToWAL(
1720
- const WriteThread::WriteGroup& write_group, uint64_t* log_used,
1817
+ IOStatus DBImpl::ConcurrentWriteGroupToWAL(
1818
+ const WriteThread::WriteGroup& write_group, uint64_t* wal_used,
1721
1819
  SequenceNumber* last_sequence, size_t seq_inc) {
1722
1820
  IOStatus io_s;
1723
1821
 
@@ -1734,14 +1832,14 @@ IOStatus DBImpl::ConcurrentWriteToWAL(
1734
1832
  return io_s;
1735
1833
  }
1736
1834
 
1737
- // We need to lock log_write_mutex_ since logs_ and alive_log_files might be
1835
+ // We need to lock wal_write_mutex_ since logs_ and alive_wal_files might be
1738
1836
  // pushed back concurrently
1739
- log_write_mutex_.Lock();
1837
+ wal_write_mutex_.Lock();
1740
1838
  if (merged_batch == write_group.leader->batch) {
1741
- write_group.leader->log_used = logfile_number_;
1839
+ write_group.leader->wal_used = cur_wal_number_;
1742
1840
  } else if (write_with_wal > 1) {
1743
1841
  for (auto writer : write_group) {
1744
- writer->log_used = logfile_number_;
1842
+ writer->wal_used = cur_wal_number_;
1745
1843
  }
1746
1844
  }
1747
1845
  *last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
@@ -1749,9 +1847,9 @@ IOStatus DBImpl::ConcurrentWriteToWAL(
1749
1847
  WriteBatchInternal::SetSequence(merged_batch, sequence);
1750
1848
 
1751
1849
  log::Writer* log_writer = logs_.back().writer;
1752
- LogFileNumberSize& log_file_number_size = alive_log_files_.back();
1850
+ WalFileNumberSize& wal_file_number_size = alive_wal_files_.back();
1753
1851
 
1754
- assert(log_writer->get_log_number() == log_file_number_size.number);
1852
+ assert(log_writer->get_log_number() == wal_file_number_size.number);
1755
1853
 
1756
1854
  uint64_t log_size;
1757
1855
 
@@ -1759,13 +1857,13 @@ IOStatus DBImpl::ConcurrentWriteToWAL(
1759
1857
  WriteOptions write_options;
1760
1858
  write_options.rate_limiter_priority =
1761
1859
  write_group.leader->rate_limiter_priority;
1762
- io_s = WriteToWAL(*merged_batch, write_options, log_writer, log_used,
1763
- &log_size, log_file_number_size);
1860
+ io_s = WriteToWAL(*merged_batch, write_options, log_writer, wal_used,
1861
+ &log_size, wal_file_number_size, sequence);
1764
1862
  if (to_be_cached_state) {
1765
1863
  cached_recoverable_state_ = *to_be_cached_state;
1766
1864
  cached_recoverable_state_empty_ = false;
1767
1865
  }
1768
- log_write_mutex_.Unlock();
1866
+ wal_write_mutex_.Unlock();
1769
1867
 
1770
1868
  if (io_s.ok()) {
1771
1869
  const bool concurrent = true;
@@ -1793,7 +1891,7 @@ Status DBImpl::WriteRecoverableState() {
1793
1891
  bool dont_care_bool;
1794
1892
  SequenceNumber next_seq;
1795
1893
  if (two_write_queues_) {
1796
- log_write_mutex_.Lock();
1894
+ wal_write_mutex_.Lock();
1797
1895
  }
1798
1896
  SequenceNumber seq;
1799
1897
  if (two_write_queues_) {
@@ -1808,13 +1906,17 @@ Status DBImpl::WriteRecoverableState() {
1808
1906
  0 /*recovery_log_number*/, this, false /* concurrent_memtable_writes */,
1809
1907
  &next_seq, &dont_care_bool, seq_per_batch_);
1810
1908
  auto last_seq = next_seq - 1;
1811
- if (two_write_queues_) {
1812
- versions_->FetchAddLastAllocatedSequence(last_seq - seq);
1813
- versions_->SetLastPublishedSequence(last_seq);
1909
+ if (status.ok()) { // Don't publish a partial batch write
1910
+ if (two_write_queues_) {
1911
+ versions_->FetchAddLastAllocatedSequence(last_seq - seq);
1912
+ versions_->SetLastPublishedSequence(last_seq);
1913
+ }
1914
+ versions_->SetLastSequence(last_seq);
1915
+ } else {
1916
+ HandleMemTableInsertFailure(status);
1814
1917
  }
1815
- versions_->SetLastSequence(last_seq);
1816
1918
  if (two_write_queues_) {
1817
- log_write_mutex_.Unlock();
1919
+ wal_write_mutex_.Unlock();
1818
1920
  }
1819
1921
  if (status.ok() && recoverable_state_pre_release_callback_) {
1820
1922
  const bool DISABLE_MEMTABLE = true;
@@ -1886,7 +1988,10 @@ void DBImpl::AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds) {
1886
1988
  assert(immutable_db_options_.atomic_flush);
1887
1989
  auto seq = versions_->LastSequence();
1888
1990
  for (auto cfd : cfds) {
1889
- cfd->imm()->AssignAtomicFlushSeq(seq);
1991
+ // cfd can be nullptr, see ScheduleFlushes()
1992
+ if (cfd) {
1993
+ cfd->imm()->AssignAtomicFlushSeq(seq);
1994
+ }
1890
1995
  }
1891
1996
  }
1892
1997
 
@@ -1895,11 +2000,11 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) {
1895
2000
  assert(write_context != nullptr);
1896
2001
  Status status;
1897
2002
 
1898
- if (alive_log_files_.begin()->getting_flushed) {
2003
+ if (alive_wal_files_.begin()->getting_flushed) {
1899
2004
  return status;
1900
2005
  }
1901
2006
 
1902
- auto oldest_alive_log = alive_log_files_.begin()->number;
2007
+ auto oldest_alive_log = alive_wal_files_.begin()->number;
1903
2008
  bool flush_wont_release_oldest_log = false;
1904
2009
  if (allow_2pc()) {
1905
2010
  auto oldest_log_with_uncommitted_prep =
@@ -1929,14 +2034,14 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) {
1929
2034
  // transactions then we cannot flush this log until those transactions are
1930
2035
  // commited.
1931
2036
  unable_to_release_oldest_log_ = false;
1932
- alive_log_files_.begin()->getting_flushed = true;
2037
+ alive_wal_files_.begin()->getting_flushed = true;
1933
2038
  }
1934
2039
 
1935
2040
  ROCKS_LOG_INFO(
1936
2041
  immutable_db_options_.info_log,
1937
2042
  "Flushing all column families with data in WAL number %" PRIu64
1938
2043
  ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64,
1939
- oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize());
2044
+ oldest_alive_log, wals_total_size_.LoadRelaxed(), GetMaxTotalWalSize());
1940
2045
  // no need to refcount because drop is happening in write thread, so can't
1941
2046
  // happen while we're in the write thread
1942
2047
  autovector<ColumnFamilyData*> cfds;
@@ -2406,22 +2511,24 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
2406
2511
  // Do this without holding the dbmutex lock.
2407
2512
  assert(versions_->prev_log_number() == 0);
2408
2513
  if (two_write_queues_) {
2409
- log_write_mutex_.Lock();
2514
+ wal_write_mutex_.Lock();
2410
2515
  }
2411
- bool creating_new_log = !log_empty_;
2516
+ bool creating_new_log = !wal_empty_;
2412
2517
  if (two_write_queues_) {
2413
- log_write_mutex_.Unlock();
2518
+ wal_write_mutex_.Unlock();
2414
2519
  }
2415
2520
  uint64_t recycle_log_number = 0;
2416
2521
  // If file deletion is disabled, don't recycle logs since it'll result in
2417
2522
  // the file getting renamed
2418
2523
  if (creating_new_log && immutable_db_options_.recycle_log_file_num &&
2419
- !log_recycle_files_.empty() && IsFileDeletionsEnabled()) {
2420
- recycle_log_number = log_recycle_files_.front();
2524
+ !wal_recycle_files_.empty() && IsFileDeletionsEnabled()) {
2525
+ recycle_log_number = wal_recycle_files_.front();
2421
2526
  }
2422
2527
  uint64_t new_log_number =
2423
- creating_new_log ? versions_->NewFileNumber() : logfile_number_;
2424
- const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
2528
+ creating_new_log ? versions_->NewFileNumber() : cur_wal_number_;
2529
+ // For use outside of holding DB mutex
2530
+ const MutableCFOptions mutable_cf_options_copy =
2531
+ cfd->GetLatestMutableCFOptions();
2425
2532
 
2426
2533
  // Set memtable_info for memtable sealed callback
2427
2534
  // TODO: memtable_info for `new_imm`
@@ -2431,7 +2538,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
2431
2538
  memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber();
2432
2539
  memtable_info.num_entries = cfd->mem()->NumEntries();
2433
2540
  memtable_info.num_deletes = cfd->mem()->NumDeletion();
2434
- if (!cfd->ioptions()->persist_user_defined_timestamps &&
2541
+ if (!cfd->ioptions().persist_user_defined_timestamps &&
2435
2542
  cfd->user_comparator()->timestamp_size() > 0) {
2436
2543
  const Slice& newest_udt = cfd->mem()->GetNewestUDT();
2437
2544
  memtable_info.newest_udt.assign(newest_udt.data(), newest_udt.size());
@@ -2440,13 +2547,22 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
2440
2547
  // flush happens before logging, but that should be ok.
2441
2548
  int num_imm_unflushed = cfd->imm()->NumNotFlushed();
2442
2549
  const auto preallocate_block_size =
2443
- GetWalPreallocateBlockSize(mutable_cf_options.write_buffer_size);
2550
+ GetWalPreallocateBlockSize(mutable_cf_options_copy.write_buffer_size);
2444
2551
  mutex_.Unlock();
2445
2552
  if (creating_new_log) {
2553
+ PredecessorWALInfo info;
2554
+ wal_write_mutex_.Lock();
2555
+ if (!logs_.empty()) {
2556
+ log::Writer* cur_log_writer = logs_.back().writer;
2557
+ info = PredecessorWALInfo(cur_log_writer->get_log_number(),
2558
+ cur_log_writer->file()->GetFileSize(),
2559
+ cur_log_writer->GetLastSeqnoRecorded());
2560
+ }
2561
+ wal_write_mutex_.Unlock();
2446
2562
  // TODO: Write buffer size passed in should be max of all CF's instead
2447
2563
  // of mutable_cf_options.write_buffer_size.
2448
2564
  io_s = CreateWAL(write_options, new_log_number, recycle_log_number,
2449
- preallocate_block_size, &new_log);
2565
+ preallocate_block_size, info, &new_log);
2450
2566
  if (s.ok()) {
2451
2567
  s = io_s;
2452
2568
  }
@@ -2464,8 +2580,8 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
2464
2580
  } else {
2465
2581
  seq = versions_->LastSequence();
2466
2582
  }
2467
- new_mem =
2468
- cfd->ConstructNewMemtable(mutable_cf_options, /*earliest_seq=*/seq);
2583
+ new_mem = cfd->ConstructNewMemtable(mutable_cf_options_copy,
2584
+ /*earliest_seq=*/seq);
2469
2585
  context->superversion_context.NewSuperVersion();
2470
2586
 
2471
2587
  ROCKS_LOG_INFO(immutable_db_options_.info_log,
@@ -2483,11 +2599,11 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
2483
2599
  // concurrent full purges don't delete the file while we're recycling it.
2484
2600
  // To achieve that we hold the old log number in the recyclable list until
2485
2601
  // after it has been renamed.
2486
- assert(log_recycle_files_.front() == recycle_log_number);
2487
- log_recycle_files_.pop_front();
2602
+ assert(wal_recycle_files_.front() == recycle_log_number);
2603
+ wal_recycle_files_.pop_front();
2488
2604
  }
2489
2605
  if (s.ok() && creating_new_log) {
2490
- InstrumentedMutexLock l(&log_write_mutex_);
2606
+ InstrumentedMutexLock l(&wal_write_mutex_);
2491
2607
  assert(new_log != nullptr);
2492
2608
  if (!logs_.empty()) {
2493
2609
  // Alway flush the buffer of the last log before switching to a new one
@@ -2509,11 +2625,11 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
2509
2625
  }
2510
2626
  }
2511
2627
  if (s.ok()) {
2512
- logfile_number_ = new_log_number;
2513
- log_empty_ = true;
2514
- log_dir_synced_ = false;
2515
- logs_.emplace_back(logfile_number_, new_log);
2516
- alive_log_files_.emplace_back(logfile_number_);
2628
+ cur_wal_number_ = new_log_number;
2629
+ wal_empty_ = true;
2630
+ wal_dir_synced_ = false;
2631
+ logs_.emplace_back(cur_wal_number_, new_log);
2632
+ alive_wal_files_.emplace_back(cur_wal_number_);
2517
2633
  }
2518
2634
  }
2519
2635
 
@@ -2544,7 +2660,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
2544
2660
  // obsolete. So we should track the WAL obsoletion event before actually
2545
2661
  // updating the empty CF's log number.
2546
2662
  uint64_t min_wal_number_to_keep =
2547
- versions_->PreComputeMinLogNumberWithUnflushedData(logfile_number_);
2663
+ versions_->PreComputeMinLogNumberWithUnflushedData(cur_wal_number_);
2548
2664
  if (min_wal_number_to_keep >
2549
2665
  versions_->GetWalSet().GetMinWalNumberToKeep()) {
2550
2666
  // TODO: plumb Env::IOActivity, Env::IOPriority
@@ -2579,7 +2695,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
2579
2695
 
2580
2696
  for (auto cf : empty_cfs) {
2581
2697
  if (cf->IsEmpty()) {
2582
- cf->SetLogNumber(logfile_number_);
2698
+ cf->SetLogNumber(cur_wal_number_);
2583
2699
  // MEMPURGE: No need to change this, because new adds
2584
2700
  // should still receive new sequence numbers.
2585
2701
  cf->mem()->SetCreationSeq(versions_->LastSequence());
@@ -2596,14 +2712,14 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
2596
2712
  // advance the log number. no need to persist this in the manifest
2597
2713
  if (cf->IsEmpty()) {
2598
2714
  if (creating_new_log) {
2599
- cf->SetLogNumber(logfile_number_);
2715
+ cf->SetLogNumber(cur_wal_number_);
2600
2716
  }
2601
2717
  cf->mem()->SetCreationSeq(versions_->LastSequence());
2602
2718
  }
2603
2719
  }
2604
2720
  }
2605
2721
 
2606
- cfd->mem()->SetNextLogNumber(logfile_number_);
2722
+ cfd->mem()->SetNextLogNumber(cur_wal_number_);
2607
2723
  assert(new_mem != nullptr);
2608
2724
  cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
2609
2725
  if (new_imm) {
@@ -2615,13 +2731,12 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
2615
2731
  // we always try to flush all immutable memtable. For atomic flush, these
2616
2732
  // two memtables will be marked eligible for flush in the same call to
2617
2733
  // AssignAtomicFlushSeq().
2618
- new_imm->SetNextLogNumber(logfile_number_);
2734
+ new_imm->SetNextLogNumber(cur_wal_number_);
2619
2735
  cfd->imm()->Add(new_imm, &context->memtables_to_free_);
2620
2736
  }
2621
2737
  new_mem->Ref();
2622
2738
  cfd->SetMemtable(new_mem);
2623
- InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context,
2624
- mutable_cf_options);
2739
+ InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context);
2625
2740
 
2626
2741
  // Notify client that memtable is sealed, now that we have successfully
2627
2742
  // installed a new memtable