@nxtedition/rocksdb 5.2.21 → 5.2.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (923):
  1. package/binding.cc +510 -967
  2. package/binding.gyp +78 -72
  3. package/chained-batch.js +1 -2
  4. package/deps/rocksdb/build_version.cc +70 -4
  5. package/deps/rocksdb/rocksdb/CMakeLists.txt +281 -149
  6. package/deps/rocksdb/rocksdb/Makefile +459 -469
  7. package/deps/rocksdb/rocksdb/TARGETS +5244 -1500
  8. package/deps/rocksdb/rocksdb/cache/cache.cc +12 -3
  9. package/deps/rocksdb/rocksdb/cache/cache_bench.cc +7 -368
  10. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +924 -0
  11. package/deps/rocksdb/rocksdb/cache/cache_entry_roles.cc +128 -0
  12. package/deps/rocksdb/rocksdb/cache/cache_entry_roles.h +103 -0
  13. package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +183 -0
  14. package/deps/rocksdb/rocksdb/cache/cache_helpers.h +11 -0
  15. package/deps/rocksdb/rocksdb/cache/cache_key.cc +344 -0
  16. package/deps/rocksdb/rocksdb/cache/cache_key.h +132 -0
  17. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +183 -0
  18. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +288 -0
  19. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc +468 -0
  20. package/deps/rocksdb/rocksdb/cache/cache_test.cc +85 -8
  21. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +121 -51
  22. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +171 -0
  23. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +86 -0
  24. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +607 -0
  25. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +381 -154
  26. package/deps/rocksdb/rocksdb/cache/lru_cache.h +176 -33
  27. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +1659 -3
  28. package/deps/rocksdb/rocksdb/cache/sharded_cache.cc +94 -23
  29. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +49 -28
  30. package/deps/rocksdb/rocksdb/cmake/modules/CxxFlags.cmake +7 -0
  31. package/deps/rocksdb/rocksdb/cmake/modules/FindJeMalloc.cmake +29 -0
  32. package/deps/rocksdb/rocksdb/cmake/modules/FindNUMA.cmake +29 -0
  33. package/deps/rocksdb/rocksdb/cmake/modules/FindSnappy.cmake +29 -0
  34. package/deps/rocksdb/rocksdb/cmake/modules/FindTBB.cmake +33 -0
  35. package/deps/rocksdb/rocksdb/cmake/modules/Findgflags.cmake +29 -0
  36. package/deps/rocksdb/rocksdb/cmake/modules/Findlz4.cmake +29 -0
  37. package/deps/rocksdb/rocksdb/cmake/modules/Finduring.cmake +26 -0
  38. package/deps/rocksdb/rocksdb/cmake/modules/Findzstd.cmake +29 -0
  39. package/deps/rocksdb/rocksdb/cmake/modules/ReadVersion.cmake +10 -0
  40. package/deps/rocksdb/rocksdb/crash_test.mk +93 -0
  41. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +54 -31
  42. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +10 -6
  43. package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator.h +146 -0
  44. package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator_test.cc +326 -0
  45. package/deps/rocksdb/rocksdb/db/blob/blob_fetcher.cc +34 -0
  46. package/deps/rocksdb/rocksdb/db/blob/blob_fetcher.h +37 -0
  47. package/deps/rocksdb/rocksdb/db/blob/blob_file_addition.cc +4 -2
  48. package/deps/rocksdb/rocksdb/db/blob/blob_file_addition_test.cc +8 -4
  49. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +99 -40
  50. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.h +20 -8
  51. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +95 -83
  52. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.cc +13 -10
  53. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.h +7 -4
  54. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc +37 -37
  55. package/deps/rocksdb/rocksdb/db/blob/blob_file_completion_callback.h +101 -0
  56. package/deps/rocksdb/rocksdb/db/blob/blob_file_meta.cc +8 -1
  57. package/deps/rocksdb/rocksdb/db/blob/blob_file_meta.h +6 -0
  58. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +209 -44
  59. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +37 -11
  60. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +382 -179
  61. package/deps/rocksdb/rocksdb/db/blob/blob_garbage_meter.cc +100 -0
  62. package/deps/rocksdb/rocksdb/db/blob/blob_garbage_meter.h +102 -0
  63. package/deps/rocksdb/rocksdb/db/blob/blob_garbage_meter_test.cc +196 -0
  64. package/deps/rocksdb/rocksdb/db/blob/blob_index.h +3 -0
  65. package/deps/rocksdb/rocksdb/db/blob/blob_log_format.h +2 -1
  66. package/deps/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc +7 -5
  67. package/deps/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.h +10 -3
  68. package/deps/rocksdb/rocksdb/db/blob/blob_log_writer.cc +12 -8
  69. package/deps/rocksdb/rocksdb/db/blob/blob_log_writer.h +5 -5
  70. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +772 -9
  71. package/deps/rocksdb/rocksdb/db/blob/db_blob_compaction_test.cc +730 -0
  72. package/deps/rocksdb/rocksdb/db/blob/db_blob_corruption_test.cc +82 -0
  73. package/deps/rocksdb/rocksdb/db/blob/db_blob_index_test.cc +155 -17
  74. package/deps/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.cc +21 -0
  75. package/deps/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.h +38 -0
  76. package/deps/rocksdb/rocksdb/db/builder.cc +137 -89
  77. package/deps/rocksdb/rocksdb/db/builder.h +16 -37
  78. package/deps/rocksdb/rocksdb/db/c.cc +413 -208
  79. package/deps/rocksdb/rocksdb/db/c_test.c +227 -138
  80. package/deps/rocksdb/rocksdb/db/column_family.cc +118 -103
  81. package/deps/rocksdb/rocksdb/db/column_family.h +86 -44
  82. package/deps/rocksdb/rocksdb/db/column_family_test.cc +38 -24
  83. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +81 -0
  84. package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +275 -0
  85. package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator_test.cc +258 -0
  86. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +81 -28
  87. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +43 -12
  88. package/deps/rocksdb/rocksdb/db/compaction/compaction_iteration_stats.h +12 -0
  89. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +406 -215
  90. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +147 -50
  91. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +167 -61
  92. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +1321 -156
  93. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +197 -28
  94. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +2 -3
  95. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +246 -43
  96. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +65 -26
  97. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +7 -7
  98. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +122 -9
  99. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +8 -2
  100. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +18 -6
  101. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +1 -1
  102. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +536 -44
  103. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +311 -30
  104. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +1 -1
  105. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +849 -0
  106. package/deps/rocksdb/rocksdb/db/compaction/file_pri.h +92 -0
  107. package/deps/rocksdb/rocksdb/db/compaction/sst_partitioner.cc +46 -0
  108. package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +1 -1
  109. package/deps/rocksdb/rocksdb/db/convenience.cc +6 -3
  110. package/deps/rocksdb/rocksdb/db/corruption_test.cc +383 -28
  111. package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +7 -2
  112. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +154 -45
  113. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +1095 -33
  114. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +1249 -203
  115. package/deps/rocksdb/rocksdb/db/db_compaction_filter_test.cc +135 -9
  116. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +1348 -166
  117. package/deps/rocksdb/rocksdb/db/db_dynamic_level_test.cc +3 -5
  118. package/deps/rocksdb/rocksdb/db/db_encryption_test.cc +1 -1
  119. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +312 -45
  120. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +1734 -48
  121. package/deps/rocksdb/rocksdb/db/{compacted_db_impl.cc → db_impl/compacted_db_impl.cc} +24 -7
  122. package/deps/rocksdb/rocksdb/db/{compacted_db_impl.h → db_impl/compacted_db_impl.h} +1 -1
  123. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +644 -333
  124. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +365 -92
  125. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +578 -210
  126. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +38 -16
  127. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +17 -10
  128. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +75 -74
  129. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +450 -183
  130. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +42 -9
  131. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +232 -15
  132. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +42 -4
  133. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +297 -100
  134. package/deps/rocksdb/rocksdb/db/db_info_dumper.cc +16 -15
  135. package/deps/rocksdb/rocksdb/db/db_inplace_update_test.cc +31 -1
  136. package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +6 -5
  137. package/deps/rocksdb/rocksdb/db/db_iter.cc +218 -153
  138. package/deps/rocksdb/rocksdb/db/db_iter.h +14 -12
  139. package/deps/rocksdb/rocksdb/db/db_iter_stress_test.cc +1 -1
  140. package/deps/rocksdb/rocksdb/db/db_iter_test.cc +84 -160
  141. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +47 -6
  142. package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +204 -0
  143. package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +21 -13
  144. package/deps/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc +17 -10
  145. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +38 -24
  146. package/deps/rocksdb/rocksdb/db/db_merge_operand_test.cc +184 -19
  147. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +1 -1
  148. package/deps/rocksdb/rocksdb/db/db_options_test.cc +183 -3
  149. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +409 -9
  150. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +92 -23
  151. package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +446 -0
  152. package/deps/rocksdb/rocksdb/db/{db_impl/db_secondary_test.cc → db_secondary_test.cc} +363 -35
  153. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +520 -15
  154. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +50 -1
  155. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +139 -4
  156. package/deps/rocksdb/rocksdb/db/db_tailing_iter_test.cc +1 -1
  157. package/deps/rocksdb/rocksdb/db/db_test.cc +669 -359
  158. package/deps/rocksdb/rocksdb/db/db_test2.cc +2110 -304
  159. package/deps/rocksdb/rocksdb/db/db_test_util.cc +76 -43
  160. package/deps/rocksdb/rocksdb/db/db_test_util.h +231 -103
  161. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +19 -11
  162. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +490 -71
  163. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +980 -349
  164. package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +11 -12
  165. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +793 -0
  166. package/deps/rocksdb/rocksdb/db/db_write_test.cc +2 -1
  167. package/deps/rocksdb/rocksdb/db/dbformat.cc +4 -12
  168. package/deps/rocksdb/rocksdb/db/dbformat.h +28 -18
  169. package/deps/rocksdb/rocksdb/db/dbformat_test.cc +3 -0
  170. package/deps/rocksdb/rocksdb/db/deletefile_test.cc +50 -15
  171. package/deps/rocksdb/rocksdb/db/error_handler.cc +127 -41
  172. package/deps/rocksdb/rocksdb/db/error_handler.h +12 -5
  173. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +524 -255
  174. package/deps/rocksdb/rocksdb/db/event_helpers.cc +136 -11
  175. package/deps/rocksdb/rocksdb/db/event_helpers.h +27 -2
  176. package/deps/rocksdb/rocksdb/db/experimental.cc +100 -0
  177. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +307 -4
  178. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +137 -60
  179. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +12 -8
  180. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +86 -55
  181. package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +86 -5
  182. package/deps/rocksdb/rocksdb/db/filename_test.cc +63 -0
  183. package/deps/rocksdb/rocksdb/db/flush_job.cc +619 -64
  184. package/deps/rocksdb/rocksdb/db/flush_job.h +30 -7
  185. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +33 -16
  186. package/deps/rocksdb/rocksdb/db/flush_scheduler.h +2 -1
  187. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +18 -17
  188. package/deps/rocksdb/rocksdb/db/forward_iterator.h +5 -4
  189. package/deps/rocksdb/rocksdb/db/forward_iterator_bench.cc +0 -1
  190. package/deps/rocksdb/rocksdb/db/history_trimming_iterator.h +91 -0
  191. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +25 -14
  192. package/deps/rocksdb/rocksdb/db/import_column_family_job.h +6 -5
  193. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +1 -1
  194. package/deps/rocksdb/rocksdb/db/internal_stats.cc +471 -50
  195. package/deps/rocksdb/rocksdb/db/internal_stats.h +129 -25
  196. package/deps/rocksdb/rocksdb/db/job_context.h +22 -9
  197. package/deps/rocksdb/rocksdb/db/kv_checksum.h +394 -0
  198. package/deps/rocksdb/rocksdb/db/listener_test.cc +518 -41
  199. package/deps/rocksdb/rocksdb/db/log_format.h +4 -1
  200. package/deps/rocksdb/rocksdb/db/log_reader.cc +129 -6
  201. package/deps/rocksdb/rocksdb/db/log_reader.h +17 -1
  202. package/deps/rocksdb/rocksdb/db/log_test.cc +161 -11
  203. package/deps/rocksdb/rocksdb/db/log_writer.cc +92 -13
  204. package/deps/rocksdb/rocksdb/db/log_writer.h +18 -5
  205. package/deps/rocksdb/rocksdb/db/logs_with_prep_tracker.h +1 -1
  206. package/deps/rocksdb/rocksdb/db/lookup_key.h +0 -1
  207. package/deps/rocksdb/rocksdb/db/malloc_stats.cc +2 -2
  208. package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +21 -8
  209. package/deps/rocksdb/rocksdb/db/memtable.cc +144 -54
  210. package/deps/rocksdb/rocksdb/db/memtable.h +72 -15
  211. package/deps/rocksdb/rocksdb/db/memtable_list.cc +95 -47
  212. package/deps/rocksdb/rocksdb/db/memtable_list.h +33 -13
  213. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +61 -31
  214. package/deps/rocksdb/rocksdb/db/merge_context.h +20 -8
  215. package/deps/rocksdb/rocksdb/db/merge_helper.cc +54 -11
  216. package/deps/rocksdb/rocksdb/db/merge_helper.h +17 -6
  217. package/deps/rocksdb/rocksdb/db/merge_helper_test.cc +13 -7
  218. package/deps/rocksdb/rocksdb/db/merge_test.cc +40 -19
  219. package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +14 -25
  220. package/deps/rocksdb/rocksdb/db/output_validator.cc +3 -0
  221. package/deps/rocksdb/rocksdb/db/output_validator.h +5 -4
  222. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +32 -28
  223. package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.cc +43 -29
  224. package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.h +9 -7
  225. package/deps/rocksdb/rocksdb/db/periodic_work_scheduler_test.cc +21 -16
  226. package/deps/rocksdb/rocksdb/db/pinned_iterators_manager.h +1 -1
  227. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +29 -36
  228. package/deps/rocksdb/rocksdb/db/pre_release_callback.h +1 -2
  229. package/deps/rocksdb/rocksdb/db/prefix_test.cc +4 -4
  230. package/deps/rocksdb/rocksdb/db/range_del_aggregator.h +2 -2
  231. package/deps/rocksdb/rocksdb/db/range_del_aggregator_bench.cc +11 -11
  232. package/deps/rocksdb/rocksdb/db/range_del_aggregator_test.cc +3 -2
  233. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.cc +14 -8
  234. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +17 -0
  235. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter_test.cc +4 -2
  236. package/deps/rocksdb/rocksdb/db/read_callback.h +1 -0
  237. package/deps/rocksdb/rocksdb/db/repair.cc +87 -58
  238. package/deps/rocksdb/rocksdb/db/repair_test.cc +35 -5
  239. package/deps/rocksdb/rocksdb/db/snapshot_impl.h +2 -1
  240. package/deps/rocksdb/rocksdb/db/table_cache.cc +95 -69
  241. package/deps/rocksdb/rocksdb/db/table_cache.h +63 -53
  242. package/deps/rocksdb/rocksdb/db/table_properties_collector.cc +4 -4
  243. package/deps/rocksdb/rocksdb/db/table_properties_collector.h +78 -10
  244. package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +28 -33
  245. package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +30 -51
  246. package/deps/rocksdb/rocksdb/db/transaction_log_impl.h +12 -8
  247. package/deps/rocksdb/rocksdb/db/version_builder.cc +564 -341
  248. package/deps/rocksdb/rocksdb/db/version_builder.h +8 -8
  249. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +327 -155
  250. package/deps/rocksdb/rocksdb/db/version_edit.cc +89 -27
  251. package/deps/rocksdb/rocksdb/db/version_edit.h +42 -17
  252. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +324 -43
  253. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +79 -22
  254. package/deps/rocksdb/rocksdb/db/version_edit_test.cc +165 -20
  255. package/deps/rocksdb/rocksdb/db/version_set.cc +935 -1034
  256. package/deps/rocksdb/rocksdb/db/version_set.h +183 -122
  257. package/deps/rocksdb/rocksdb/db/version_set_test.cc +556 -138
  258. package/deps/rocksdb/rocksdb/db/version_util.h +68 -0
  259. package/deps/rocksdb/rocksdb/db/wal_manager.cc +23 -21
  260. package/deps/rocksdb/rocksdb/db/wal_manager.h +5 -2
  261. package/deps/rocksdb/rocksdb/db/wal_manager_test.cc +30 -27
  262. package/deps/rocksdb/rocksdb/db/write_batch.cc +704 -209
  263. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +135 -2
  264. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +209 -5
  265. package/deps/rocksdb/rocksdb/db/write_callback_test.cc +2 -0
  266. package/deps/rocksdb/rocksdb/db/write_controller.cc +47 -54
  267. package/deps/rocksdb/rocksdb/db/write_controller.h +12 -9
  268. package/deps/rocksdb/rocksdb/db/write_controller_test.cc +215 -103
  269. package/deps/rocksdb/rocksdb/db/write_thread.cc +11 -0
  270. package/deps/rocksdb/rocksdb/db/write_thread.h +14 -8
  271. package/deps/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt +7 -4
  272. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +10 -3
  273. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +6 -0
  274. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress.cc +1 -1
  275. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +19 -2
  276. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +78 -25
  277. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_filter.h +13 -2
  278. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +29 -12
  279. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +5 -1
  280. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +199 -32
  281. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.cc +188 -0
  282. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +59 -10
  283. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +77 -109
  284. package/deps/rocksdb/rocksdb/{third-party/folly/folly/synchronization/WaitOptions.cpp → db_stress_tool/db_stress_stat.cc} +9 -4
  285. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_stat.h +7 -6
  286. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_table_properties_collector.h +1 -0
  287. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +699 -143
  288. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +20 -2
  289. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +49 -39
  290. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +631 -0
  291. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.h +287 -0
  292. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +1565 -0
  293. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +374 -0
  294. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +149 -18
  295. package/deps/rocksdb/rocksdb/env/composite_env.cc +464 -0
  296. package/deps/rocksdb/rocksdb/env/composite_env_wrapper.h +98 -646
  297. package/deps/rocksdb/rocksdb/env/emulated_clock.h +114 -0
  298. package/deps/rocksdb/rocksdb/env/env.cc +632 -42
  299. package/deps/rocksdb/rocksdb/env/env_basic_test.cc +84 -36
  300. package/deps/rocksdb/rocksdb/env/env_chroot.cc +88 -286
  301. package/deps/rocksdb/rocksdb/env/env_chroot.h +34 -1
  302. package/deps/rocksdb/rocksdb/env/env_encryption.cc +469 -277
  303. package/deps/rocksdb/rocksdb/env/env_encryption_ctr.h +9 -30
  304. package/deps/rocksdb/rocksdb/env/env_posix.cc +110 -119
  305. package/deps/rocksdb/rocksdb/env/env_test.cc +1128 -39
  306. package/deps/rocksdb/rocksdb/env/file_system.cc +147 -8
  307. package/deps/rocksdb/rocksdb/env/file_system_tracer.cc +207 -136
  308. package/deps/rocksdb/rocksdb/env/file_system_tracer.h +86 -54
  309. package/deps/rocksdb/rocksdb/env/fs_posix.cc +192 -64
  310. package/deps/rocksdb/rocksdb/env/fs_readonly.h +107 -0
  311. package/deps/rocksdb/rocksdb/env/fs_remap.cc +339 -0
  312. package/deps/rocksdb/rocksdb/env/fs_remap.h +139 -0
  313. package/deps/rocksdb/rocksdb/env/io_posix.cc +245 -41
  314. package/deps/rocksdb/rocksdb/env/io_posix.h +66 -1
  315. package/deps/rocksdb/rocksdb/env/mock_env.cc +147 -149
  316. package/deps/rocksdb/rocksdb/env/mock_env.h +113 -11
  317. package/deps/rocksdb/rocksdb/env/mock_env_test.cc +2 -4
  318. package/deps/rocksdb/rocksdb/env/unique_id_gen.cc +164 -0
  319. package/deps/rocksdb/rocksdb/env/unique_id_gen.h +71 -0
  320. package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +9 -5
  321. package/deps/rocksdb/rocksdb/file/delete_scheduler.h +6 -4
  322. package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +19 -12
  323. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +459 -70
  324. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +205 -28
  325. package/deps/rocksdb/rocksdb/file/file_util.cc +39 -28
  326. package/deps/rocksdb/rocksdb/file/file_util.h +18 -27
  327. package/deps/rocksdb/rocksdb/file/filename.cc +59 -22
  328. package/deps/rocksdb/rocksdb/file/filename.h +13 -8
  329. package/deps/rocksdb/rocksdb/file/line_file_reader.cc +68 -0
  330. package/deps/rocksdb/rocksdb/file/line_file_reader.h +59 -0
  331. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +1130 -6
  332. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +220 -36
  333. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +69 -17
  334. package/deps/rocksdb/rocksdb/file/random_access_file_reader_test.cc +13 -12
  335. package/deps/rocksdb/rocksdb/file/read_write_util.cc +3 -38
  336. package/deps/rocksdb/rocksdb/file/read_write_util.h +0 -4
  337. package/deps/rocksdb/rocksdb/file/readahead_file_info.h +33 -0
  338. package/deps/rocksdb/rocksdb/file/sequence_file_reader.cc +57 -9
  339. package/deps/rocksdb/rocksdb/file/sequence_file_reader.h +58 -6
  340. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.cc +29 -54
  341. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.h +22 -29
  342. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +424 -50
  343. package/deps/rocksdb/rocksdb/file/writable_file_writer.h +66 -19
  344. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +157 -66
  345. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +224 -121
  346. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +333 -30
  347. package/deps/rocksdb/rocksdb/include/rocksdb/cache_bench_tool.h +14 -0
  348. package/deps/rocksdb/rocksdb/include/rocksdb/cleanable.h +1 -1
  349. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_filter.h +90 -50
  350. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +13 -5
  351. package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +20 -4
  352. package/deps/rocksdb/rocksdb/include/rocksdb/concurrent_task_limiter.h +8 -3
  353. package/deps/rocksdb/rocksdb/include/rocksdb/configurable.h +53 -12
  354. package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +31 -6
  355. package/deps/rocksdb/rocksdb/include/rocksdb/customizable.h +102 -7
  356. package/deps/rocksdb/rocksdb/include/rocksdb/data_structure.h +51 -0
  357. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +370 -262
  358. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +286 -87
  359. package/deps/rocksdb/rocksdb/include/rocksdb/env_encryption.h +124 -64
  360. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +27 -0
  361. package/deps/rocksdb/rocksdb/include/rocksdb/file_checksum.h +21 -4
  362. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +384 -41
  363. package/deps/rocksdb/rocksdb/include/rocksdb/filter_policy.h +111 -143
  364. package/deps/rocksdb/rocksdb/include/rocksdb/flush_block_policy.h +20 -6
  365. package/deps/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h +56 -0
  366. package/deps/rocksdb/rocksdb/include/rocksdb/io_status.h +15 -33
  367. package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +37 -1
  368. package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +1 -3
  369. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +314 -26
  370. package/deps/rocksdb/rocksdb/include/rocksdb/memory_allocator.h +11 -7
  371. package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +50 -15
  372. package/deps/rocksdb/rocksdb/include/rocksdb/merge_operator.h +10 -3
  373. package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +186 -96
  374. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +373 -103
  375. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +13 -3
  376. package/deps/rocksdb/rocksdb/include/rocksdb/persistent_cache.h +2 -2
  377. package/deps/rocksdb/rocksdb/include/rocksdb/rate_limiter.h +37 -7
  378. package/deps/rocksdb/rocksdb/include/rocksdb/rocksdb_namespace.h +6 -0
  379. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +87 -0
  380. package/deps/rocksdb/rocksdb/include/rocksdb/slice.h +5 -12
  381. package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +59 -30
  382. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_manager.h +11 -11
  383. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +22 -0
  384. package/deps/rocksdb/rocksdb/include/rocksdb/sst_partitioner.h +17 -10
  385. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +121 -41
  386. package/deps/rocksdb/rocksdb/include/rocksdb/stats_history.h +1 -0
  387. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +114 -136
  388. package/deps/rocksdb/rocksdb/include/rocksdb/system_clock.h +116 -0
  389. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +160 -18
  390. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +57 -15
  391. package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +3 -1
  392. package/deps/rocksdb/rocksdb/include/rocksdb/trace_reader_writer.h +10 -6
  393. package/deps/rocksdb/rocksdb/include/rocksdb/trace_record.h +247 -0
  394. package/deps/rocksdb/rocksdb/include/rocksdb/trace_record_result.h +187 -0
  395. package/deps/rocksdb/rocksdb/include/rocksdb/transaction_log.h +1 -1
  396. package/deps/rocksdb/rocksdb/include/rocksdb/types.h +14 -24
  397. package/deps/rocksdb/rocksdb/include/rocksdb/unique_id.h +46 -0
  398. package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +14 -4
  399. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/agg_merge.h +138 -0
  400. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h +631 -0
  401. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/cache_dump_load.h +142 -0
  402. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/checkpoint.h +12 -9
  403. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/customizable_util.h +368 -0
  404. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +24 -0
  405. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h +4 -0
  406. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/object_registry.h +418 -63
  407. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h +143 -73
  408. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h +2 -2
  409. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/replayer.h +87 -0
  410. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/sim_cache.h +2 -2
  411. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +43 -5
  412. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +18 -23
  413. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +26 -0
  414. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +32 -6
  415. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h +1 -2
  416. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +20 -1
  417. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +30 -3
  418. package/deps/rocksdb/rocksdb/include/rocksdb/wal_filter.h +11 -2
  419. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +89 -11
  420. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch_base.h +11 -0
  421. package/deps/rocksdb/rocksdb/include/rocksdb/write_buffer_manager.h +108 -38
  422. package/deps/rocksdb/rocksdb/logging/auto_roll_logger.cc +40 -23
  423. package/deps/rocksdb/rocksdb/logging/auto_roll_logger.h +12 -5
  424. package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +100 -49
  425. package/deps/rocksdb/rocksdb/logging/env_logger.h +7 -5
  426. package/deps/rocksdb/rocksdb/logging/env_logger_test.cc +0 -1
  427. package/deps/rocksdb/rocksdb/logging/posix_logger.h +3 -9
  428. package/deps/rocksdb/rocksdb/memory/arena.cc +3 -1
  429. package/deps/rocksdb/rocksdb/memory/arena.h +1 -1
  430. package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc +171 -106
  431. package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.h +31 -15
  432. package/deps/rocksdb/rocksdb/memory/memkind_kmem_allocator.cc +15 -4
  433. package/deps/rocksdb/rocksdb/memory/memkind_kmem_allocator.h +24 -8
  434. package/deps/rocksdb/rocksdb/memory/memory_allocator.cc +91 -0
  435. package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +239 -0
  436. package/deps/rocksdb/rocksdb/memory/memory_usage.h +14 -1
  437. package/deps/rocksdb/rocksdb/memtable/hash_linklist_rep.cc +72 -9
  438. package/deps/rocksdb/rocksdb/memtable/hash_skiplist_rep.cc +52 -6
  439. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +53 -0
  440. package/deps/rocksdb/rocksdb/memtable/inlineskiplist_test.cc +5 -5
  441. package/deps/rocksdb/rocksdb/memtable/memtablerep_bench.cc +17 -5
  442. package/deps/rocksdb/rocksdb/memtable/skiplist_test.cc +1 -1
  443. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +87 -0
  444. package/deps/rocksdb/rocksdb/memtable/vectorrep.cc +20 -10
  445. package/deps/rocksdb/rocksdb/memtable/write_buffer_manager.cc +148 -94
  446. package/deps/rocksdb/rocksdb/memtable/write_buffer_manager_test.cc +160 -62
  447. package/deps/rocksdb/rocksdb/microbench/CMakeLists.txt +17 -0
  448. package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +1360 -0
  449. package/deps/rocksdb/rocksdb/microbench/ribbon_bench.cc +153 -0
  450. package/deps/rocksdb/rocksdb/monitoring/histogram.cc +8 -15
  451. package/deps/rocksdb/rocksdb/monitoring/histogram.h +0 -1
  452. package/deps/rocksdb/rocksdb/monitoring/histogram_test.cc +18 -16
  453. package/deps/rocksdb/rocksdb/monitoring/histogram_windowing.cc +9 -7
  454. package/deps/rocksdb/rocksdb/monitoring/histogram_windowing.h +5 -3
  455. package/deps/rocksdb/rocksdb/monitoring/instrumented_mutex.cc +7 -5
  456. package/deps/rocksdb/rocksdb/monitoring/instrumented_mutex.h +37 -12
  457. package/deps/rocksdb/rocksdb/monitoring/iostats_context.cc +26 -6
  458. package/deps/rocksdb/rocksdb/monitoring/iostats_context_imp.h +6 -10
  459. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +14 -13
  460. package/deps/rocksdb/rocksdb/monitoring/perf_context_imp.h +19 -20
  461. package/deps/rocksdb/rocksdb/monitoring/perf_step_timer.h +18 -18
  462. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +84 -2
  463. package/deps/rocksdb/rocksdb/monitoring/statistics.h +6 -0
  464. package/deps/rocksdb/rocksdb/monitoring/statistics_test.cc +47 -2
  465. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +67 -54
  466. package/deps/rocksdb/rocksdb/monitoring/thread_status_updater.cc +4 -1
  467. package/deps/rocksdb/rocksdb/monitoring/thread_status_util.cc +2 -1
  468. package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +2 -2
  469. package/deps/rocksdb/rocksdb/options/cf_options.cc +280 -212
  470. package/deps/rocksdb/rocksdb/options/cf_options.h +51 -57
  471. package/deps/rocksdb/rocksdb/options/configurable.cc +242 -138
  472. package/deps/rocksdb/rocksdb/options/configurable_helper.h +4 -68
  473. package/deps/rocksdb/rocksdb/options/configurable_test.cc +144 -21
  474. package/deps/rocksdb/rocksdb/options/configurable_test.h +2 -3
  475. package/deps/rocksdb/rocksdb/options/customizable.cc +67 -7
  476. package/deps/rocksdb/rocksdb/options/customizable_test.cc +1773 -151
  477. package/deps/rocksdb/rocksdb/options/db_options.cc +275 -47
  478. package/deps/rocksdb/rocksdb/options/db_options.h +36 -7
  479. package/deps/rocksdb/rocksdb/options/options.cc +49 -17
  480. package/deps/rocksdb/rocksdb/options/options_helper.cc +369 -352
  481. package/deps/rocksdb/rocksdb/options/options_helper.h +23 -23
  482. package/deps/rocksdb/rocksdb/options/options_parser.cc +18 -13
  483. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +67 -54
  484. package/deps/rocksdb/rocksdb/options/options_test.cc +1162 -187
  485. package/deps/rocksdb/rocksdb/port/jemalloc_helper.h +1 -1
  486. package/deps/rocksdb/rocksdb/port/lang.h +52 -0
  487. package/deps/rocksdb/rocksdb/port/port_example.h +1 -1
  488. package/deps/rocksdb/rocksdb/port/port_posix.cc +31 -2
  489. package/deps/rocksdb/rocksdb/port/port_posix.h +20 -2
  490. package/deps/rocksdb/rocksdb/port/stack_trace.cc +20 -4
  491. package/deps/rocksdb/rocksdb/port/sys_time.h +2 -2
  492. package/deps/rocksdb/rocksdb/port/win/env_default.cc +7 -7
  493. package/deps/rocksdb/rocksdb/port/win/env_win.cc +44 -74
  494. package/deps/rocksdb/rocksdb/port/win/env_win.h +25 -23
  495. package/deps/rocksdb/rocksdb/port/win/io_win.cc +32 -34
  496. package/deps/rocksdb/rocksdb/port/win/io_win.h +12 -6
  497. package/deps/rocksdb/rocksdb/port/win/port_win.cc +55 -35
  498. package/deps/rocksdb/rocksdb/port/win/port_win.h +22 -5
  499. package/deps/rocksdb/rocksdb/port/win/win_logger.cc +3 -3
  500. package/deps/rocksdb/rocksdb/port/win/win_logger.h +3 -5
  501. package/deps/rocksdb/rocksdb/port/win/win_thread.cc +7 -1
  502. package/deps/rocksdb/rocksdb/port/win/win_thread.h +12 -17
  503. package/deps/rocksdb/rocksdb/python.mk +9 -0
  504. package/deps/rocksdb/rocksdb/src.mk +82 -34
  505. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc +3 -4
  506. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h +1 -1
  507. package/deps/rocksdb/rocksdb/table/block_based/block.cc +158 -80
  508. package/deps/rocksdb/rocksdb/table/block_based/block.h +64 -36
  509. package/deps/rocksdb/rocksdb/table/block_based/block_based_filter_block.cc +23 -14
  510. package/deps/rocksdb/rocksdb/table/block_based/block_based_filter_block.h +13 -5
  511. package/deps/rocksdb/rocksdb/table/block_based/block_based_filter_block_test.cc +3 -218
  512. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +603 -328
  513. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +28 -22
  514. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +220 -82
  515. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +8 -2
  516. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +3 -4
  517. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +28 -4
  518. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +598 -492
  519. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +151 -96
  520. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +31 -58
  521. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +330 -92
  522. package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +50 -19
  523. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +23 -0
  524. package/deps/rocksdb/rocksdb/table/block_based/block_like_traits.h +226 -0
  525. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +56 -22
  526. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.h +42 -4
  527. package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +5 -2
  528. package/deps/rocksdb/rocksdb/table/block_based/block_type.h +2 -0
  529. package/deps/rocksdb/rocksdb/table/block_based/cachable_entry.h +34 -20
  530. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +9 -10
  531. package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +26 -3
  532. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +2 -1
  533. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +844 -202
  534. package/deps/rocksdb/rocksdb/table/block_based/filter_policy_internal.h +281 -81
  535. package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy.cc +62 -2
  536. package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy.h +2 -3
  537. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.cc +28 -7
  538. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.h +22 -6
  539. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc +28 -26
  540. package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +1 -1
  541. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +1 -2
  542. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +2 -1
  543. package/deps/rocksdb/rocksdb/table/block_based/mock_block_based_table.h +11 -4
  544. package/deps/rocksdb/rocksdb/table/block_based/parsed_full_filter_block.cc +2 -1
  545. package/deps/rocksdb/rocksdb/table/block_based/parsed_full_filter_block.h +2 -0
  546. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +68 -26
  547. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +44 -9
  548. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +12 -10
  549. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +3 -4
  550. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.h +23 -4
  551. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +44 -19
  552. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.h +5 -1
  553. package/deps/rocksdb/rocksdb/table/block_based/reader_common.cc +16 -28
  554. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +7 -4
  555. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +2 -2
  556. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +77 -57
  557. package/deps/rocksdb/rocksdb/table/block_fetcher.h +23 -12
  558. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +43 -56
  559. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.cc +8 -8
  560. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.h +2 -1
  561. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc +52 -70
  562. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.cc +5 -8
  563. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h +1 -1
  564. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc +17 -11
  565. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.h +2 -3
  566. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc +42 -51
  567. package/deps/rocksdb/rocksdb/table/format.cc +258 -104
  568. package/deps/rocksdb/rocksdb/table/format.h +120 -109
  569. package/deps/rocksdb/rocksdb/table/get_context.cc +97 -65
  570. package/deps/rocksdb/rocksdb/table/get_context.h +19 -12
  571. package/deps/rocksdb/rocksdb/table/internal_iterator.h +14 -0
  572. package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +8 -0
  573. package/deps/rocksdb/rocksdb/table/merger_test.cc +3 -2
  574. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +11 -21
  575. package/deps/rocksdb/rocksdb/table/merging_iterator.h +3 -3
  576. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +176 -171
  577. package/deps/rocksdb/rocksdb/table/meta_blocks.h +47 -33
  578. package/deps/rocksdb/rocksdb/table/mock_table.cc +7 -9
  579. package/deps/rocksdb/rocksdb/table/mock_table.h +3 -2
  580. package/deps/rocksdb/rocksdb/table/multiget_context.h +15 -8
  581. package/deps/rocksdb/rocksdb/table/persistent_cache_helper.cc +22 -29
  582. package/deps/rocksdb/rocksdb/table/persistent_cache_options.h +6 -3
  583. package/deps/rocksdb/rocksdb/table/plain/plain_table_bloom.h +5 -8
  584. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +29 -26
  585. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.h +12 -16
  586. package/deps/rocksdb/rocksdb/table/plain/plain_table_factory.cc +145 -69
  587. package/deps/rocksdb/rocksdb/table/plain/plain_table_factory.h +1 -1
  588. package/deps/rocksdb/rocksdb/table/plain/plain_table_index.cc +7 -6
  589. package/deps/rocksdb/rocksdb/table/plain/plain_table_index.h +3 -4
  590. package/deps/rocksdb/rocksdb/table/plain/plain_table_key_coding.cc +3 -1
  591. package/deps/rocksdb/rocksdb/table/plain/plain_table_key_coding.h +1 -1
  592. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +13 -18
  593. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.h +4 -9
  594. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +55 -37
  595. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +10 -5
  596. package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +11 -8
  597. package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +222 -16
  598. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +106 -58
  599. package/deps/rocksdb/rocksdb/table/sst_file_writer_collectors.h +6 -5
  600. package/deps/rocksdb/rocksdb/table/table_builder.h +68 -44
  601. package/deps/rocksdb/rocksdb/table/table_factory.cc +37 -10
  602. package/deps/rocksdb/rocksdb/table/table_properties.cc +109 -54
  603. package/deps/rocksdb/rocksdb/table/table_properties_internal.h +4 -20
  604. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +33 -32
  605. package/deps/rocksdb/rocksdb/table/table_reader_caller.h +2 -0
  606. package/deps/rocksdb/rocksdb/table/table_test.cc +989 -326
  607. package/deps/rocksdb/rocksdb/table/two_level_iterator.cc +4 -0
  608. package/deps/rocksdb/rocksdb/table/unique_id.cc +166 -0
  609. package/deps/rocksdb/rocksdb/table/unique_id_impl.h +59 -0
  610. package/deps/rocksdb/rocksdb/test_util/mock_time_env.cc +1 -1
  611. package/deps/rocksdb/rocksdb/test_util/mock_time_env.h +13 -10
  612. package/deps/rocksdb/rocksdb/test_util/sync_point.cc +1 -2
  613. package/deps/rocksdb/rocksdb/test_util/sync_point.h +35 -16
  614. package/deps/rocksdb/rocksdb/test_util/sync_point_impl.cc +32 -10
  615. package/deps/rocksdb/rocksdb/test_util/sync_point_impl.h +31 -4
  616. package/deps/rocksdb/rocksdb/test_util/testharness.cc +53 -1
  617. package/deps/rocksdb/rocksdb/test_util/testharness.h +67 -3
  618. package/deps/rocksdb/rocksdb/test_util/testutil.cc +236 -66
  619. package/deps/rocksdb/rocksdb/test_util/testutil.h +63 -100
  620. package/deps/rocksdb/rocksdb/test_util/transaction_test_util.cc +12 -1
  621. package/deps/rocksdb/rocksdb/tools/blob_dump.cc +2 -2
  622. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.cc +6 -3
  623. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.h +1 -0
  624. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +9 -3
  625. package/deps/rocksdb/rocksdb/tools/db_bench.cc +1 -1
  626. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +1420 -611
  627. package/deps/rocksdb/rocksdb/tools/db_bench_tool_test.cc +11 -8
  628. package/deps/rocksdb/rocksdb/tools/db_repl_stress.cc +11 -1
  629. package/deps/rocksdb/rocksdb/tools/io_tracer_parser_test.cc +4 -2
  630. package/deps/rocksdb/rocksdb/tools/io_tracer_parser_tool.cc +46 -22
  631. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +655 -179
  632. package/deps/rocksdb/rocksdb/tools/ldb_cmd_impl.h +58 -6
  633. package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +472 -29
  634. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +23 -2
  635. package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +2 -2
  636. package/deps/rocksdb/rocksdb/tools/simulated_hybrid_file_system.cc +246 -0
  637. package/deps/rocksdb/rocksdb/tools/simulated_hybrid_file_system.h +126 -0
  638. package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +83 -29
  639. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +38 -17
  640. package/deps/rocksdb/rocksdb/tools/trace_analyzer_test.cc +191 -55
  641. package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.cc +219 -296
  642. package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.h +87 -53
  643. package/deps/rocksdb/rocksdb/tools/write_stress.cc +8 -7
  644. package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer.cc +6 -5
  645. package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer.h +5 -4
  646. package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer_test.cc +14 -9
  647. package/deps/rocksdb/rocksdb/trace_replay/io_tracer.cc +134 -60
  648. package/deps/rocksdb/rocksdb/trace_replay/io_tracer.h +49 -38
  649. package/deps/rocksdb/rocksdb/trace_replay/io_tracer_test.cc +152 -15
  650. package/deps/rocksdb/rocksdb/trace_replay/trace_record.cc +206 -0
  651. package/deps/rocksdb/rocksdb/trace_replay/trace_record_handler.cc +190 -0
  652. package/deps/rocksdb/rocksdb/trace_replay/trace_record_handler.h +46 -0
  653. package/deps/rocksdb/rocksdb/trace_replay/trace_record_result.cc +146 -0
  654. package/deps/rocksdb/rocksdb/trace_replay/trace_replay.cc +475 -344
  655. package/deps/rocksdb/rocksdb/trace_replay/trace_replay.h +83 -95
  656. package/deps/rocksdb/rocksdb/util/autovector.h +38 -18
  657. package/deps/rocksdb/rocksdb/util/autovector_test.cc +1 -1
  658. package/deps/rocksdb/rocksdb/util/bloom_impl.h +4 -0
  659. package/deps/rocksdb/rocksdb/util/bloom_test.cc +276 -94
  660. package/deps/rocksdb/rocksdb/util/build_version.cc.in +81 -4
  661. package/deps/rocksdb/rocksdb/util/cast_util.h +22 -0
  662. package/deps/rocksdb/rocksdb/util/channel.h +2 -0
  663. package/deps/rocksdb/rocksdb/util/coding.h +1 -33
  664. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +8 -0
  665. package/deps/rocksdb/rocksdb/util/comparator.cc +163 -3
  666. package/deps/rocksdb/rocksdb/util/compression.cc +122 -0
  667. package/deps/rocksdb/rocksdb/util/compression.h +212 -7
  668. package/deps/rocksdb/rocksdb/util/compression_context_cache.cc +1 -3
  669. package/deps/rocksdb/rocksdb/util/crc32c.cc +165 -2
  670. package/deps/rocksdb/rocksdb/util/crc32c.h +6 -0
  671. package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +14 -0
  672. package/deps/rocksdb/rocksdb/util/crc32c_ppc.h +3 -0
  673. package/deps/rocksdb/rocksdb/util/crc32c_test.cc +47 -0
  674. package/deps/rocksdb/rocksdb/util/defer.h +30 -1
  675. package/deps/rocksdb/rocksdb/util/defer_test.cc +11 -0
  676. package/deps/rocksdb/rocksdb/util/duplicate_detector.h +3 -1
  677. package/deps/rocksdb/rocksdb/util/dynamic_bloom.h +3 -3
  678. package/deps/rocksdb/rocksdb/util/dynamic_bloom_test.cc +5 -4
  679. package/deps/rocksdb/rocksdb/util/fastrange.h +2 -0
  680. package/deps/rocksdb/rocksdb/util/file_checksum_helper.cc +36 -0
  681. package/deps/rocksdb/rocksdb/util/file_checksum_helper.h +3 -1
  682. package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +512 -52
  683. package/deps/rocksdb/rocksdb/util/filter_bench.cc +65 -10
  684. package/deps/rocksdb/rocksdb/util/gflags_compat.h +6 -1
  685. package/deps/rocksdb/rocksdb/util/hash.cc +121 -3
  686. package/deps/rocksdb/rocksdb/util/hash.h +31 -1
  687. package/deps/rocksdb/rocksdb/util/hash128.h +26 -0
  688. package/deps/rocksdb/rocksdb/util/hash_containers.h +51 -0
  689. package/deps/rocksdb/rocksdb/util/hash_test.cc +194 -2
  690. package/deps/rocksdb/rocksdb/util/heap.h +6 -1
  691. package/deps/rocksdb/rocksdb/util/kv_map.h +1 -1
  692. package/deps/rocksdb/rocksdb/util/log_write_bench.cc +8 -6
  693. package/deps/rocksdb/rocksdb/util/math.h +74 -7
  694. package/deps/rocksdb/rocksdb/util/math128.h +13 -1
  695. package/deps/rocksdb/rocksdb/util/murmurhash.h +3 -3
  696. package/deps/rocksdb/rocksdb/util/random.cc +9 -0
  697. package/deps/rocksdb/rocksdb/util/random.h +6 -0
  698. package/deps/rocksdb/rocksdb/util/rate_limiter.cc +298 -144
  699. package/deps/rocksdb/rocksdb/util/rate_limiter.h +68 -19
  700. package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +335 -23
  701. package/deps/rocksdb/rocksdb/util/repeatable_thread.h +10 -12
  702. package/deps/rocksdb/rocksdb/util/repeatable_thread_test.cc +18 -15
  703. package/deps/rocksdb/rocksdb/util/ribbon_alg.h +98 -74
  704. package/deps/rocksdb/rocksdb/util/ribbon_config.cc +506 -0
  705. package/deps/rocksdb/rocksdb/util/ribbon_config.h +182 -0
  706. package/deps/rocksdb/rocksdb/util/ribbon_impl.h +154 -79
  707. package/deps/rocksdb/rocksdb/util/ribbon_test.cc +742 -365
  708. package/deps/rocksdb/rocksdb/util/set_comparator.h +2 -0
  709. package/deps/rocksdb/rocksdb/util/slice.cc +198 -35
  710. package/deps/rocksdb/rocksdb/util/slice_test.cc +30 -1
  711. package/deps/rocksdb/rocksdb/util/status.cc +32 -29
  712. package/deps/rocksdb/rocksdb/util/stop_watch.h +18 -18
  713. package/deps/rocksdb/rocksdb/util/string_util.cc +85 -6
  714. package/deps/rocksdb/rocksdb/util/string_util.h +47 -2
  715. package/deps/rocksdb/rocksdb/util/thread_guard.h +41 -0
  716. package/deps/rocksdb/rocksdb/util/thread_local.h +2 -2
  717. package/deps/rocksdb/rocksdb/util/thread_local_test.cc +22 -24
  718. package/deps/rocksdb/rocksdb/util/threadpool_imp.cc +7 -6
  719. package/deps/rocksdb/rocksdb/util/timer.h +55 -46
  720. package/deps/rocksdb/rocksdb/util/timer_test.cc +50 -48
  721. package/deps/rocksdb/rocksdb/util/user_comparator_wrapper.h +4 -0
  722. package/deps/rocksdb/rocksdb/util/vector_iterator.h +31 -15
  723. package/deps/rocksdb/rocksdb/util/work_queue.h +2 -0
  724. package/deps/rocksdb/rocksdb/util/xxhash.cc +35 -1144
  725. package/deps/rocksdb/rocksdb/util/xxhash.h +5117 -373
  726. package/deps/rocksdb/rocksdb/util/xxph3.h +1762 -0
  727. package/deps/rocksdb/rocksdb/utilities/agg_merge/agg_merge.cc +238 -0
  728. package/deps/rocksdb/rocksdb/utilities/agg_merge/agg_merge.h +49 -0
  729. package/deps/rocksdb/rocksdb/utilities/agg_merge/agg_merge_test.cc +134 -0
  730. package/deps/rocksdb/rocksdb/utilities/agg_merge/test_agg_merge.cc +104 -0
  731. package/deps/rocksdb/rocksdb/utilities/agg_merge/test_agg_merge.h +47 -0
  732. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +3164 -0
  733. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_impl.h +29 -0
  734. package/deps/rocksdb/rocksdb/utilities/{backupable/backupable_db_test.cc → backup/backup_engine_test.cc} +1679 -485
  735. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc +6 -4
  736. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.h +14 -9
  737. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.cc +2 -0
  738. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.h +1 -0
  739. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_gc_stats.h +4 -0
  740. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +37 -27
  741. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +8 -4
  742. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +1 -1
  743. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_iterator.h +13 -10
  744. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +5 -0
  745. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +44 -25
  746. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +3 -4
  747. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.cc +27 -19
  748. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.h +4 -2
  749. package/deps/rocksdb/rocksdb/utilities/cache_dump_load.cc +69 -0
  750. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +489 -0
  751. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +366 -0
  752. package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_compaction_filter.cc +67 -4
  753. package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_compaction_filter.h +21 -6
  754. package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_functional_test.cc +107 -7
  755. package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_options.h +43 -0
  756. package/deps/rocksdb/rocksdb/utilities/cassandra/format.h +1 -1
  757. package/deps/rocksdb/rocksdb/utilities/cassandra/merge_operator.cc +24 -8
  758. package/deps/rocksdb/rocksdb/utilities/cassandra/merge_operator.h +7 -7
  759. package/deps/rocksdb/rocksdb/utilities/cassandra/serialize.h +5 -0
  760. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +99 -218
  761. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.h +8 -24
  762. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +114 -1
  763. package/deps/rocksdb/rocksdb/utilities/compaction_filters/layered_compaction_filter_base.h +6 -2
  764. package/deps/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc +0 -4
  765. package/deps/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h +7 -6
  766. package/deps/rocksdb/rocksdb/utilities/compaction_filters.cc +56 -0
  767. package/deps/rocksdb/rocksdb/utilities/convenience/info_log_finder.cc +2 -2
  768. package/deps/rocksdb/rocksdb/utilities/counted_fs.cc +355 -0
  769. package/deps/rocksdb/rocksdb/utilities/counted_fs.h +152 -0
  770. package/deps/rocksdb/rocksdb/utilities/env_mirror.cc +13 -0
  771. package/deps/rocksdb/rocksdb/utilities/env_timed.cc +164 -122
  772. package/deps/rocksdb/rocksdb/utilities/env_timed.h +97 -0
  773. package/deps/rocksdb/rocksdb/utilities/fault_injection_env.cc +75 -17
  774. package/deps/rocksdb/rocksdb/utilities/fault_injection_env.h +19 -3
  775. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +539 -126
  776. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +162 -17
  777. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +110 -0
  778. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +94 -0
  779. package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +5 -2
  780. package/deps/rocksdb/rocksdb/utilities/memory_allocators.h +104 -0
  781. package/deps/rocksdb/rocksdb/utilities/merge_operators/bytesxor.h +5 -3
  782. package/deps/rocksdb/rocksdb/utilities/merge_operators/max.cc +4 -1
  783. package/deps/rocksdb/rocksdb/utilities/merge_operators/put.cc +11 -3
  784. package/deps/rocksdb/rocksdb/utilities/merge_operators/sortlist.cc +0 -2
  785. package/deps/rocksdb/rocksdb/utilities/merge_operators/sortlist.h +5 -1
  786. package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend.cc +29 -10
  787. package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend.h +6 -3
  788. package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend2.cc +29 -14
  789. package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend2.h +6 -3
  790. package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc +71 -18
  791. package/deps/rocksdb/rocksdb/utilities/merge_operators/uint64add.cc +15 -9
  792. package/deps/rocksdb/rocksdb/utilities/merge_operators.cc +120 -0
  793. package/deps/rocksdb/rocksdb/utilities/merge_operators.h +3 -23
  794. package/deps/rocksdb/rocksdb/utilities/object_registry.cc +267 -42
  795. package/deps/rocksdb/rocksdb/utilities/object_registry_test.cc +702 -76
  796. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +1 -1
  797. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration_test.cc +26 -5
  798. package/deps/rocksdb/rocksdb/utilities/options/options_util.cc +1 -1
  799. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +124 -1
  800. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.cc +2 -3
  801. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.h +8 -9
  802. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc +15 -13
  803. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h +1 -1
  804. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h +4 -4
  805. package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table_evictable.h +2 -2
  806. package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_bench.cc +8 -9
  807. package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_test.cc +1 -1
  808. package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_tier.h +6 -3
  809. package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.h +2 -2
  810. package/deps/rocksdb/rocksdb/utilities/simulator_cache/cache_simulator.cc +3 -0
  811. package/deps/rocksdb/rocksdb/utilities/simulator_cache/cache_simulator_test.cc +2 -0
  812. package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc +43 -35
  813. package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache_test.cc +20 -18
  814. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc +107 -2
  815. package/deps/rocksdb/rocksdb/utilities/trace/file_trace_reader_writer.cc +23 -15
  816. package/deps/rocksdb/rocksdb/utilities/trace/file_trace_reader_writer.h +2 -2
  817. package/deps/rocksdb/rocksdb/utilities/trace/replayer_impl.cc +316 -0
  818. package/deps/rocksdb/rocksdb/utilities/trace/replayer_impl.h +86 -0
  819. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc +4 -5
  820. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.h +4 -3
  821. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h +1 -1
  822. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_locking_test.cc +119 -3
  823. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc +20 -3
  824. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h +20 -0
  825. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h +3 -2
  826. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h +4 -0
  827. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc +38 -14
  828. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h +17 -10
  829. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h +1 -0
  830. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +1 -2
  831. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +423 -34
  832. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +82 -2
  833. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +72 -40
  834. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.h +32 -1
  835. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +13 -5
  836. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +7 -3
  837. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +207 -43
  838. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +50 -7
  839. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.cc +28 -10
  840. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.h +11 -6
  841. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +516 -0
  842. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +506 -15
  843. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +27 -13
  844. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +14 -14
  845. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.h +3 -0
  846. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +2 -2
  847. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +14 -5
  848. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +305 -27
  849. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +55 -159
  850. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +209 -2
  851. package/deps/rocksdb/rocksdb/utilities/wal_filter.cc +23 -0
  852. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +157 -88
  853. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +501 -114
  854. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +91 -316
  855. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +1212 -672
  856. package/deps/rocksdb/rocksdb.gyp +425 -446
  857. package/index.js +5 -87
  858. package/package-lock.json +23687 -0
  859. package/package.json +8 -9
  860. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  861. package/prebuilds/darwin-x64/node.napi.node +0 -0
  862. package/prebuilds/{darwin-x64+arm64 → linux-x64}/node.napi.node +0 -0
  863. package/deps/rocksdb/rocksdb/README.md +0 -32
  864. package/deps/rocksdb/rocksdb/env/env_hdfs.cc +0 -648
  865. package/deps/rocksdb/rocksdb/hdfs/README +0 -23
  866. package/deps/rocksdb/rocksdb/hdfs/env_hdfs.h +0 -386
  867. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/backupable_db.h +0 -535
  868. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/env_librados.h +0 -175
  869. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/utility_db.h +0 -34
  870. package/deps/rocksdb/rocksdb/memory/memkind_kmem_allocator_test.cc +0 -102
  871. package/deps/rocksdb/rocksdb/memtable/hash_linklist_rep.h +0 -49
  872. package/deps/rocksdb/rocksdb/memtable/hash_skiplist_rep.h +0 -44
  873. package/deps/rocksdb/rocksdb/options/customizable_helper.h +0 -216
  874. package/deps/rocksdb/rocksdb/port/README +0 -10
  875. package/deps/rocksdb/rocksdb/third-party/folly/folly/CPortability.h +0 -27
  876. package/deps/rocksdb/rocksdb/third-party/folly/folly/ConstexprMath.h +0 -45
  877. package/deps/rocksdb/rocksdb/third-party/folly/folly/Indestructible.h +0 -166
  878. package/deps/rocksdb/rocksdb/third-party/folly/folly/Optional.h +0 -570
  879. package/deps/rocksdb/rocksdb/third-party/folly/folly/Portability.h +0 -92
  880. package/deps/rocksdb/rocksdb/third-party/folly/folly/ScopeGuard.h +0 -54
  881. package/deps/rocksdb/rocksdb/third-party/folly/folly/Traits.h +0 -152
  882. package/deps/rocksdb/rocksdb/third-party/folly/folly/Unit.h +0 -59
  883. package/deps/rocksdb/rocksdb/third-party/folly/folly/Utility.h +0 -141
  884. package/deps/rocksdb/rocksdb/third-party/folly/folly/chrono/Hardware.h +0 -33
  885. package/deps/rocksdb/rocksdb/third-party/folly/folly/container/Array.h +0 -74
  886. package/deps/rocksdb/rocksdb/third-party/folly/folly/detail/Futex-inl.h +0 -117
  887. package/deps/rocksdb/rocksdb/third-party/folly/folly/detail/Futex.cpp +0 -263
  888. package/deps/rocksdb/rocksdb/third-party/folly/folly/detail/Futex.h +0 -96
  889. package/deps/rocksdb/rocksdb/third-party/folly/folly/functional/Invoke.h +0 -40
  890. package/deps/rocksdb/rocksdb/third-party/folly/folly/hash/Hash.h +0 -29
  891. package/deps/rocksdb/rocksdb/third-party/folly/folly/lang/Align.h +0 -144
  892. package/deps/rocksdb/rocksdb/third-party/folly/folly/lang/Bits.h +0 -30
  893. package/deps/rocksdb/rocksdb/third-party/folly/folly/lang/Launder.h +0 -51
  894. package/deps/rocksdb/rocksdb/third-party/folly/folly/portability/Asm.h +0 -28
  895. package/deps/rocksdb/rocksdb/third-party/folly/folly/portability/SysSyscall.h +0 -10
  896. package/deps/rocksdb/rocksdb/third-party/folly/folly/portability/SysTypes.h +0 -26
  897. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/AtomicNotification-inl.h +0 -138
  898. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/AtomicNotification.cpp +0 -23
  899. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/AtomicNotification.h +0 -57
  900. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/AtomicUtil-inl.h +0 -260
  901. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/AtomicUtil.h +0 -52
  902. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/Baton.h +0 -328
  903. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex-inl.h +0 -1703
  904. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex.cpp +0 -16
  905. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex.h +0 -304
  906. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutexSpecializations.h +0 -39
  907. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/ParkingLot.cpp +0 -26
  908. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/ParkingLot.h +0 -318
  909. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/WaitOptions.h +0 -57
  910. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/detail/InlineFunctionRef.h +0 -219
  911. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/detail/ProxyLockable-inl.h +0 -207
  912. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/detail/ProxyLockable.h +0 -164
  913. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/detail/Sleeper.h +0 -57
  914. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/detail/Spin.h +0 -77
  915. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/test/DistributedMutexTest.cpp +0 -1145
  916. package/deps/rocksdb/rocksdb/util/build_version.h +0 -15
  917. package/deps/rocksdb/rocksdb/util/xxh3p.h +0 -1392
  918. package/deps/rocksdb/rocksdb/utilities/backupable/backupable_db.cc +0 -2354
  919. package/deps/rocksdb/rocksdb/utilities/env_librados.cc +0 -1497
  920. package/deps/rocksdb/rocksdb/utilities/env_librados_test.cc +0 -1146
  921. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/README +0 -13
  922. package/deps/snappy/snappy-1.1.7/README.md +0 -149
  923. package/prebuilds/linux-x64/node.napi.glibc.node +0 -0
@@ -9,11 +9,10 @@
9
9
 
10
10
  #include "db/version_set.h"
11
11
 
12
- #include <stdio.h>
13
-
14
12
  #include <algorithm>
15
13
  #include <array>
16
14
  #include <cinttypes>
15
+ #include <cstdio>
17
16
  #include <list>
18
17
  #include <map>
19
18
  #include <set>
@@ -21,10 +20,14 @@
21
20
  #include <unordered_map>
22
21
  #include <vector>
23
22
 
24
- #include "compaction/compaction.h"
23
+ #include "db/blob/blob_fetcher.h"
25
24
  #include "db/blob/blob_file_cache.h"
26
25
  #include "db/blob/blob_file_reader.h"
27
26
  #include "db/blob/blob_index.h"
27
+ #include "db/blob/blob_log_format.h"
28
+ #include "db/compaction/compaction.h"
29
+ #include "db/compaction/file_pri.h"
30
+ #include "db/dbformat.h"
28
31
  #include "db/internal_stats.h"
29
32
  #include "db/log_reader.h"
30
33
  #include "db/log_writer.h"
@@ -39,9 +42,11 @@
39
42
  #include "file/random_access_file_reader.h"
40
43
  #include "file/read_write_util.h"
41
44
  #include "file/writable_file_writer.h"
45
+ #include "logging/logging.h"
42
46
  #include "monitoring/file_read_sample.h"
43
47
  #include "monitoring/perf_context_imp.h"
44
48
  #include "monitoring/persistent_stats_history.h"
49
+ #include "options/options_helper.h"
45
50
  #include "rocksdb/env.h"
46
51
  #include "rocksdb/merge_operator.h"
47
52
  #include "rocksdb/write_buffer_manager.h"
@@ -116,10 +121,9 @@ Status OverlapWithIterator(const Comparator* ucmp,
116
121
  // are MergeInProgress).
117
122
  class FilePicker {
118
123
  public:
119
- FilePicker(std::vector<FileMetaData*>* files, const Slice& user_key,
120
- const Slice& ikey, autovector<LevelFilesBrief>* file_levels,
121
- unsigned int num_levels, FileIndexer* file_indexer,
122
- const Comparator* user_comparator,
124
+ FilePicker(const Slice& user_key, const Slice& ikey,
125
+ autovector<LevelFilesBrief>* file_levels, unsigned int num_levels,
126
+ FileIndexer* file_indexer, const Comparator* user_comparator,
123
127
  const InternalKeyComparator* internal_comparator)
124
128
  : num_levels_(num_levels),
125
129
  curr_level_(static_cast<unsigned int>(-1)),
@@ -127,9 +131,6 @@ class FilePicker {
127
131
  hit_file_level_(static_cast<unsigned int>(-1)),
128
132
  search_left_bound_(0),
129
133
  search_right_bound_(FileIndexer::kLevelMaxIndex),
130
- #ifndef NDEBUG
131
- files_(files),
132
- #endif
133
134
  level_files_brief_(file_levels),
134
135
  is_hit_file_last_in_level_(false),
135
136
  curr_file_level_(nullptr),
@@ -138,9 +139,6 @@ class FilePicker {
138
139
  file_indexer_(file_indexer),
139
140
  user_comparator_(user_comparator),
140
141
  internal_comparator_(internal_comparator) {
141
- #ifdef NDEBUG
142
- (void)files;
143
- #endif
144
142
  // Setup member variables to search first level.
145
143
  search_ended_ = !PrepareNextLevel();
146
144
  if (!search_ended_) {
@@ -210,23 +208,7 @@ class FilePicker {
210
208
  }
211
209
  }
212
210
  }
213
- #ifndef NDEBUG
214
- // Sanity check to make sure that the files are correctly sorted
215
- if (prev_file_) {
216
- if (curr_level_ != 0) {
217
- int comp_sign = internal_comparator_->Compare(
218
- prev_file_->largest_key, f->smallest_key);
219
- assert(comp_sign < 0);
220
- } else {
221
- // level == 0, the current file cannot be newer than the previous
222
- // one. Use compressed data structure, has no attribute seqNo
223
- assert(curr_index_in_curr_level_ > 0);
224
- assert(!NewestFirstBySeqNo(files_[0][curr_index_in_curr_level_],
225
- files_[0][curr_index_in_curr_level_-1]));
226
- }
227
- }
228
- prev_file_ = f;
229
- #endif
211
+
230
212
  returned_file_level_ = curr_level_;
231
213
  if (curr_level_ > 0 && cmp_largest < 0) {
232
214
  // No more files to search in this level.
@@ -258,9 +240,6 @@ class FilePicker {
258
240
  unsigned int hit_file_level_;
259
241
  int32_t search_left_bound_;
260
242
  int32_t search_right_bound_;
261
- #ifndef NDEBUG
262
- std::vector<FileMetaData*>* files_;
263
- #endif
264
243
  autovector<LevelFilesBrief>* level_files_brief_;
265
244
  bool search_ended_;
266
245
  bool is_hit_file_last_in_level_;
@@ -272,9 +251,6 @@ class FilePicker {
272
251
  FileIndexer* file_indexer_;
273
252
  const Comparator* user_comparator_;
274
253
  const InternalKeyComparator* internal_comparator_;
275
- #ifndef NDEBUG
276
- FdWithKeyRange* prev_file_;
277
- #endif
278
254
 
279
255
  // Setup local variables to search next level.
280
256
  // Returns false if there are no more levels to search.
@@ -344,9 +320,7 @@ class FilePicker {
344
320
  }
345
321
  start_index_in_curr_level_ = start_index;
346
322
  curr_index_in_curr_level_ = start_index;
347
- #ifndef NDEBUG
348
- prev_file_ = nullptr;
349
- #endif
323
+
350
324
  return true;
351
325
  }
352
326
  // curr_level_ = num_levels_. So, no more levels to search.
@@ -408,7 +382,7 @@ class FilePickerMultiGet {
408
382
  int GetCurrentLevel() const { return curr_level_; }
409
383
 
410
384
  // Iterates through files in the current level until it finds a file that
411
- // contains atleast one key from the MultiGet batch
385
+ // contains at least one key from the MultiGet batch
412
386
  bool GetNextFileInLevelWithKeys(MultiGetRange* next_file_range,
413
387
  size_t* file_index, FdWithKeyRange** fd,
414
388
  bool* is_last_key_in_file) {
@@ -885,9 +859,10 @@ class LevelIterator final : public InternalIterator {
885
859
  const FileOptions& file_options,
886
860
  const InternalKeyComparator& icomparator,
887
861
  const LevelFilesBrief* flevel,
888
- const SliceTransform* prefix_extractor, bool should_sample,
889
- HistogramImpl* file_read_hist, TableReaderCaller caller,
890
- bool skip_filters, int level, RangeDelAggregator* range_del_agg,
862
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
863
+ bool should_sample, HistogramImpl* file_read_hist,
864
+ TableReaderCaller caller, bool skip_filters, int level,
865
+ RangeDelAggregator* range_del_agg,
891
866
  const std::vector<AtomicCompactionUnitBoundary>*
892
867
  compaction_boundaries = nullptr,
893
868
  bool allow_unprepared_value = false)
@@ -907,7 +882,8 @@ class LevelIterator final : public InternalIterator {
907
882
  level_(level),
908
883
  range_del_agg_(range_del_agg),
909
884
  pinned_iters_mgr_(nullptr),
910
- compaction_boundaries_(compaction_boundaries) {
885
+ compaction_boundaries_(compaction_boundaries),
886
+ is_next_read_sequential_(false) {
911
887
  // Empty level is not supported.
912
888
  assert(flevel_ != nullptr && flevel_->num_files > 0);
913
889
  }
@@ -1037,7 +1013,7 @@ class LevelIterator final : public InternalIterator {
1037
1013
  // `prefix_extractor_` may be non-null even for total order seek. Checking
1038
1014
  // this variable is not the right way to identify whether prefix iterator
1039
1015
  // is used.
1040
- const SliceTransform* prefix_extractor_;
1016
+ const std::shared_ptr<const SliceTransform>& prefix_extractor_;
1041
1017
 
1042
1018
  HistogramImpl* file_read_hist_;
1043
1019
  bool should_sample_;
@@ -1054,6 +1030,8 @@ class LevelIterator final : public InternalIterator {
1054
1030
  // To be propagated to RangeDelAggregator in order to safely truncate range
1055
1031
  // tombstones.
1056
1032
  const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries_;
1033
+
1034
+ bool is_next_read_sequential_;
1057
1035
  };
1058
1036
 
1059
1037
  void LevelIterator::Seek(const Slice& target) {
@@ -1155,7 +1133,9 @@ bool LevelIterator::NextAndGetResult(IterateResult* result) {
1155
1133
  assert(Valid());
1156
1134
  bool is_valid = file_iter_.NextAndGetResult(result);
1157
1135
  if (!is_valid) {
1136
+ is_next_read_sequential_ = true;
1158
1137
  SkipEmptyFileForward();
1138
+ is_next_read_sequential_ = false;
1159
1139
  is_valid = Valid();
1160
1140
  if (is_valid) {
1161
1141
  result->key = key();
@@ -1222,6 +1202,12 @@ void LevelIterator::SetFileIterator(InternalIterator* iter) {
1222
1202
  }
1223
1203
 
1224
1204
  InternalIterator* old_iter = file_iter_.Set(iter);
1205
+
1206
+ // Update the read pattern for PrefetchBuffer.
1207
+ if (is_next_read_sequential_) {
1208
+ file_iter_.UpdateReadaheadState(old_iter);
1209
+ }
1210
+
1225
1211
  if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
1226
1212
  pinned_iters_mgr_->PinIterator(old_iter);
1227
1213
  } else {
@@ -1259,7 +1245,7 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
1259
1245
  auto ioptions = cfd_->ioptions();
1260
1246
  Status s = table_cache->GetTableProperties(
1261
1247
  file_options_, cfd_->internal_comparator(), file_meta->fd, tp,
1262
- mutable_cf_options_.prefix_extractor.get(), true /* no io */);
1248
+ mutable_cf_options_.prefix_extractor, true /* no io */);
1263
1249
  if (s.ok()) {
1264
1250
  return s;
1265
1251
  }
@@ -1287,24 +1273,23 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
1287
1273
  return s;
1288
1274
  }
1289
1275
 
1290
- TableProperties* raw_table_properties;
1291
- // By setting the magic number to kInvalidTableMagicNumber, we can by
1292
- // pass the magic number check in the footer.
1276
+ // By setting the magic number to kNullTableMagicNumber, we can bypass
1277
+ // the magic number check in the footer.
1293
1278
  std::unique_ptr<RandomAccessFileReader> file_reader(
1294
1279
  new RandomAccessFileReader(
1295
1280
  std::move(file), file_name, nullptr /* env */, io_tracer_,
1296
1281
  nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */,
1297
1282
  nullptr /* rate_limiter */, ioptions->listeners));
1283
+ std::unique_ptr<TableProperties> props;
1298
1284
  s = ReadTableProperties(
1299
1285
  file_reader.get(), file_meta->fd.GetFileSize(),
1300
- Footer::kInvalidTableMagicNumber /* table's magic number */, *ioptions,
1301
- &raw_table_properties, false /* compression_type_missing */);
1286
+ Footer::kNullTableMagicNumber /* table's magic number */, *ioptions,
1287
+ &props);
1302
1288
  if (!s.ok()) {
1303
1289
  return s;
1304
1290
  }
1305
- RecordTick(ioptions->statistics, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES);
1306
-
1307
- *tp = std::shared_ptr<const TableProperties>(raw_table_properties);
1291
+ *tp = std::move(props);
1292
+ RecordTick(ioptions->stats, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES);
1308
1293
  return s;
1309
1294
  }
1310
1295
 
@@ -1453,7 +1438,7 @@ size_t Version::GetMemoryUsageByTableReaders() {
1453
1438
  for (size_t i = 0; i < file_level.num_files; i++) {
1454
1439
  total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader(
1455
1440
  file_options_, cfd_->internal_comparator(), file_level.files[i].fd,
1456
- mutable_cf_options_.prefix_extractor.get());
1441
+ mutable_cf_options_.prefix_extractor);
1457
1442
  }
1458
1443
  }
1459
1444
  return total_usage;
@@ -1468,6 +1453,10 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
1468
1453
  cf_meta->file_count = 0;
1469
1454
  cf_meta->levels.clear();
1470
1455
 
1456
+ cf_meta->blob_file_size = 0;
1457
+ cf_meta->blob_file_count = 0;
1458
+ cf_meta->blob_files.clear();
1459
+
1471
1460
  auto* ioptions = cfd_->ioptions();
1472
1461
  auto* vstorage = storage_info();
1473
1462
 
@@ -1485,15 +1474,16 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
1485
1474
  file_path = ioptions->cf_paths.back().path;
1486
1475
  }
1487
1476
  const uint64_t file_number = file->fd.GetNumber();
1488
- files.emplace_back(SstFileMetaData{
1477
+ files.emplace_back(
1489
1478
  MakeTableFileName("", file_number), file_number, file_path,
1490
- static_cast<size_t>(file->fd.GetFileSize()), file->fd.smallest_seqno,
1479
+ file->fd.GetFileSize(), file->fd.smallest_seqno,
1491
1480
  file->fd.largest_seqno, file->smallest.user_key().ToString(),
1492
1481
  file->largest.user_key().ToString(),
1493
1482
  file->stats.num_reads_sampled.load(std::memory_order_relaxed),
1494
- file->being_compacted, file->oldest_blob_file_number,
1495
- file->TryGetOldestAncesterTime(), file->TryGetFileCreationTime(),
1496
- file->file_checksum, file->file_checksum_func_name});
1483
+ file->being_compacted, file->temperature,
1484
+ file->oldest_blob_file_number, file->TryGetOldestAncesterTime(),
1485
+ file->TryGetFileCreationTime(), file->file_checksum,
1486
+ file->file_checksum_func_name);
1497
1487
  files.back().num_entries = file->num_entries;
1498
1488
  files.back().num_deletions = file->num_deletions;
1499
1489
  level_size += file->fd.GetFileSize();
@@ -1502,6 +1492,18 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
1502
1492
  level, level_size, std::move(files));
1503
1493
  cf_meta->size += level_size;
1504
1494
  }
1495
+ for (const auto& meta : vstorage->GetBlobFiles()) {
1496
+ assert(meta);
1497
+
1498
+ cf_meta->blob_files.emplace_back(
1499
+ meta->GetBlobFileNumber(), BlobFileName("", meta->GetBlobFileNumber()),
1500
+ ioptions->cf_paths.front().path, meta->GetBlobFileSize(),
1501
+ meta->GetTotalBlobCount(), meta->GetTotalBlobBytes(),
1502
+ meta->GetGarbageBlobCount(), meta->GetGarbageBlobBytes(),
1503
+ meta->GetChecksumMethod(), meta->GetChecksumValue());
1504
+ ++cf_meta->blob_file_count;
1505
+ cf_meta->blob_file_size += meta->GetBlobFileSize();
1506
+ }
1505
1507
  }
1506
1508
 
1507
1509
  uint64_t Version::GetSstFilesSize() {
@@ -1617,7 +1619,7 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
1617
1619
  merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator(
1618
1620
  read_options, soptions, cfd_->internal_comparator(),
1619
1621
  *file.file_metadata, range_del_agg,
1620
- mutable_cf_options_.prefix_extractor.get(), nullptr,
1622
+ mutable_cf_options_.prefix_extractor, nullptr,
1621
1623
  cfd_->internal_stats()->GetFileReadHist(0),
1622
1624
  TableReaderCaller::kUserIterator, arena,
1623
1625
  /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
@@ -1641,7 +1643,7 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
1641
1643
  merge_iter_builder->AddIterator(new (mem) LevelIterator(
1642
1644
  cfd_->table_cache(), read_options, soptions,
1643
1645
  cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
1644
- mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(),
1646
+ mutable_cf_options_.prefix_extractor, should_sample_file_read(),
1645
1647
  cfd_->internal_stats()->GetFileReadHist(level),
1646
1648
  TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
1647
1649
  range_del_agg,
@@ -1676,7 +1678,7 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
1676
1678
  ScopedArenaIterator iter(cfd_->table_cache()->NewIterator(
1677
1679
  read_options, file_options, cfd_->internal_comparator(),
1678
1680
  *file->file_metadata, &range_del_agg,
1679
- mutable_cf_options_.prefix_extractor.get(), nullptr,
1681
+ mutable_cf_options_.prefix_extractor, nullptr,
1680
1682
  cfd_->internal_stats()->GetFileReadHist(0),
1681
1683
  TableReaderCaller::kUserIterator, &arena,
1682
1684
  /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
@@ -1694,7 +1696,7 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
1694
1696
  ScopedArenaIterator iter(new (mem) LevelIterator(
1695
1697
  cfd_->table_cache(), read_options, file_options,
1696
1698
  cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
1697
- mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(),
1699
+ mutable_cf_options_.prefix_extractor, should_sample_file_read(),
1698
1700
  cfd_->internal_stats()->GetFileReadHist(level),
1699
1701
  TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
1700
1702
  &range_del_agg));
@@ -1761,14 +1763,14 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
1761
1763
  const std::shared_ptr<IOTracer>& io_tracer,
1762
1764
  uint64_t version_number)
1763
1765
  : env_(vset->env_),
1766
+ clock_(vset->clock_),
1764
1767
  cfd_(column_family_data),
1765
- info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->info_log),
1766
- db_statistics_((cfd_ == nullptr) ? nullptr
1767
- : cfd_->ioptions()->statistics),
1768
+ info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->logger),
1769
+ db_statistics_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->stats),
1768
1770
  table_cache_((cfd_ == nullptr) ? nullptr : cfd_->table_cache()),
1769
1771
  blob_file_cache_(cfd_ ? cfd_->blob_file_cache() : nullptr),
1770
- merge_operator_((cfd_ == nullptr) ? nullptr
1771
- : cfd_->ioptions()->merge_operator),
1772
+ merge_operator_(
1773
+ (cfd_ == nullptr) ? nullptr : cfd_->ioptions()->merge_operator.get()),
1772
1774
  storage_info_(
1773
1775
  (cfd_ == nullptr) ? nullptr : &cfd_->internal_comparator(),
1774
1776
  (cfd_ == nullptr) ? nullptr : cfd_->user_comparator(),
@@ -1792,11 +1794,8 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
1792
1794
 
1793
1795
  Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key,
1794
1796
  const Slice& blob_index_slice,
1795
- PinnableSlice* value) const {
1796
- if (read_options.read_tier == kBlockCacheTier) {
1797
- return Status::Incomplete("Cannot read blob: no disk I/O allowed");
1798
- }
1799
-
1797
+ FilePrefetchBuffer* prefetch_buffer,
1798
+ PinnableSlice* value, uint64_t* bytes_read) const {
1800
1799
  BlobIndex blob_index;
1801
1800
 
1802
1801
  {
@@ -1806,24 +1805,27 @@ Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key,
1806
1805
  }
1807
1806
  }
1808
1807
 
1809
- return GetBlob(read_options, user_key, blob_index, value);
1808
+ return GetBlob(read_options, user_key, blob_index, prefetch_buffer, value,
1809
+ bytes_read);
1810
1810
  }
1811
1811
 
1812
1812
  Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key,
1813
1813
  const BlobIndex& blob_index,
1814
- PinnableSlice* value) const {
1814
+ FilePrefetchBuffer* prefetch_buffer,
1815
+ PinnableSlice* value, uint64_t* bytes_read) const {
1815
1816
  assert(value);
1816
1817
 
1818
+ if (read_options.read_tier == kBlockCacheTier) {
1819
+ return Status::Incomplete("Cannot read blob: no disk I/O allowed");
1820
+ }
1821
+
1817
1822
  if (blob_index.HasTTL() || blob_index.IsInlined()) {
1818
1823
  return Status::Corruption("Unexpected TTL/inlined blob index");
1819
1824
  }
1820
1825
 
1821
- const auto& blob_files = storage_info_.GetBlobFiles();
1822
-
1823
1826
  const uint64_t blob_file_number = blob_index.file_number();
1824
1827
 
1825
- const auto it = blob_files.find(blob_file_number);
1826
- if (it == blob_files.end()) {
1828
+ if (!storage_info_.GetBlobFileMetaData(blob_file_number)) {
1827
1829
  return Status::Corruption("Invalid blob file number");
1828
1830
  }
1829
1831
 
@@ -1841,15 +1843,131 @@ Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key,
1841
1843
  assert(blob_file_reader.GetValue());
1842
1844
  const Status s = blob_file_reader.GetValue()->GetBlob(
1843
1845
  read_options, user_key, blob_index.offset(), blob_index.size(),
1844
- blob_index.compression(), value);
1846
+ blob_index.compression(), prefetch_buffer, value, bytes_read);
1845
1847
 
1846
1848
  return s;
1847
1849
  }
1848
1850
 
1851
+ void Version::MultiGetBlob(
1852
+ const ReadOptions& read_options, MultiGetRange& range,
1853
+ std::unordered_map<uint64_t, BlobReadRequests>& blob_rqs) {
1854
+ if (read_options.read_tier == kBlockCacheTier) {
1855
+ Status s = Status::Incomplete("Cannot read blob(s): no disk I/O allowed");
1856
+ for (const auto& elem : blob_rqs) {
1857
+ for (const auto& blob_rq : elem.second) {
1858
+ const KeyContext& key_context = blob_rq.second;
1859
+ assert(key_context.s);
1860
+ assert(key_context.s->ok());
1861
+ *(key_context.s) = s;
1862
+ assert(key_context.get_context);
1863
+ auto& get_context = *(key_context.get_context);
1864
+ get_context.MarkKeyMayExist();
1865
+ }
1866
+ }
1867
+ return;
1868
+ }
1869
+
1870
+ assert(!blob_rqs.empty());
1871
+ Status status;
1872
+
1873
+ for (auto& elem : blob_rqs) {
1874
+ const uint64_t blob_file_number = elem.first;
1875
+
1876
+ if (!storage_info_.GetBlobFileMetaData(blob_file_number)) {
1877
+ auto& blobs_in_file = elem.second;
1878
+ for (const auto& blob : blobs_in_file) {
1879
+ const KeyContext& key_context = blob.second;
1880
+ *(key_context.s) = Status::Corruption("Invalid blob file number");
1881
+ }
1882
+ continue;
1883
+ }
1884
+
1885
+ CacheHandleGuard<BlobFileReader> blob_file_reader;
1886
+ assert(blob_file_cache_);
1887
+ status = blob_file_cache_->GetBlobFileReader(blob_file_number,
1888
+ &blob_file_reader);
1889
+ assert(!status.ok() || blob_file_reader.GetValue());
1890
+
1891
+ auto& blobs_in_file = elem.second;
1892
+ if (!status.ok()) {
1893
+ for (const auto& blob : blobs_in_file) {
1894
+ const KeyContext& key_context = blob.second;
1895
+ *(key_context.s) = status;
1896
+ }
1897
+ continue;
1898
+ }
1899
+
1900
+ assert(blob_file_reader.GetValue());
1901
+ const uint64_t file_size = blob_file_reader.GetValue()->GetFileSize();
1902
+ const CompressionType compression =
1903
+ blob_file_reader.GetValue()->GetCompressionType();
1904
+
1905
+ // sort blobs_in_file by file offset.
1906
+ std::sort(
1907
+ blobs_in_file.begin(), blobs_in_file.end(),
1908
+ [](const BlobReadRequest& lhs, const BlobReadRequest& rhs) -> bool {
1909
+ assert(lhs.first.file_number() == rhs.first.file_number());
1910
+ return lhs.first.offset() < rhs.first.offset();
1911
+ });
1912
+
1913
+ autovector<std::reference_wrapper<const KeyContext>> blob_read_key_contexts;
1914
+ autovector<std::reference_wrapper<const Slice>> user_keys;
1915
+ autovector<uint64_t> offsets;
1916
+ autovector<uint64_t> value_sizes;
1917
+ autovector<Status*> statuses;
1918
+ autovector<PinnableSlice*> values;
1919
+ for (const auto& blob : blobs_in_file) {
1920
+ const auto& blob_index = blob.first;
1921
+ const KeyContext& key_context = blob.second;
1922
+ if (blob_index.HasTTL() || blob_index.IsInlined()) {
1923
+ *(key_context.s) =
1924
+ Status::Corruption("Unexpected TTL/inlined blob index");
1925
+ continue;
1926
+ }
1927
+ const uint64_t key_size = key_context.ukey_with_ts.size();
1928
+ const uint64_t offset = blob_index.offset();
1929
+ const uint64_t value_size = blob_index.size();
1930
+ if (!IsValidBlobOffset(offset, key_size, value_size, file_size)) {
1931
+ *(key_context.s) = Status::Corruption("Invalid blob offset");
1932
+ continue;
1933
+ }
1934
+ if (blob_index.compression() != compression) {
1935
+ *(key_context.s) =
1936
+ Status::Corruption("Compression type mismatch when reading a blob");
1937
+ continue;
1938
+ }
1939
+ blob_read_key_contexts.emplace_back(std::cref(key_context));
1940
+ user_keys.emplace_back(std::cref(key_context.ukey_with_ts));
1941
+ offsets.push_back(blob_index.offset());
1942
+ value_sizes.push_back(blob_index.size());
1943
+ statuses.push_back(key_context.s);
1944
+ values.push_back(key_context.value);
1945
+ }
1946
+ blob_file_reader.GetValue()->MultiGetBlob(read_options, user_keys, offsets,
1947
+ value_sizes, statuses, values,
1948
+ /*bytes_read=*/nullptr);
1949
+ size_t num = blob_read_key_contexts.size();
1950
+ assert(num == user_keys.size());
1951
+ assert(num == offsets.size());
1952
+ assert(num == value_sizes.size());
1953
+ assert(num == statuses.size());
1954
+ assert(num == values.size());
1955
+ for (size_t i = 0; i < num; ++i) {
1956
+ if (statuses[i]->ok()) {
1957
+ range.AddValueSize(blob_read_key_contexts[i].get().value->size());
1958
+ if (range.GetValueSize() > read_options.value_size_soft_limit) {
1959
+ *(blob_read_key_contexts[i].get().s) = Status::Aborted();
1960
+ }
1961
+ }
1962
+ }
1963
+ }
1964
+ }
1965
+
1849
1966
  void Version::Get(const ReadOptions& read_options, const LookupKey& k,
1850
1967
  PinnableSlice* value, std::string* timestamp, Status* status,
1851
1968
  MergeContext* merge_context,
1852
- SequenceNumber* max_covering_tombstone_seq, bool* value_found,
1969
+ SequenceNumber* max_covering_tombstone_seq,
1970
+ PinnedIteratorsManager* pinned_iters_mgr, bool* value_found,
1853
1971
  bool* key_exists, SequenceNumber* seq, ReadCallback* callback,
1854
1972
  bool* is_blob, bool do_merge) {
1855
1973
  Slice ikey = k.internal_key();
@@ -1862,7 +1980,6 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
1862
1980
  *key_exists = true;
1863
1981
  }
1864
1982
 
1865
- PinnedIteratorsManager pinned_iters_mgr;
1866
1983
  uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId;
1867
1984
  if (vset_ && vset_->block_cache_tracer_ &&
1868
1985
  vset_->block_cache_tracer_->is_tracing_enabled()) {
@@ -1874,24 +1991,26 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
1874
1991
  // need to provide it here.
1875
1992
  bool is_blob_index = false;
1876
1993
  bool* const is_blob_to_use = is_blob ? is_blob : &is_blob_index;
1994
+ BlobFetcher blob_fetcher(this, read_options);
1877
1995
 
1996
+ assert(pinned_iters_mgr);
1878
1997
  GetContext get_context(
1879
1998
  user_comparator(), merge_operator_, info_log_, db_statistics_,
1880
1999
  status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key,
1881
2000
  do_merge ? value : nullptr, do_merge ? timestamp : nullptr, value_found,
1882
- merge_context, do_merge, max_covering_tombstone_seq, this->env_, seq,
1883
- merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob_to_use,
1884
- tracing_get_id);
2001
+ merge_context, do_merge, max_covering_tombstone_seq, clock_, seq,
2002
+ merge_operator_ ? pinned_iters_mgr : nullptr, callback, is_blob_to_use,
2003
+ tracing_get_id, &blob_fetcher);
1885
2004
 
1886
2005
  // Pin blocks that we read to hold merge operands
1887
2006
  if (merge_operator_) {
1888
- pinned_iters_mgr.StartPinning();
2007
+ pinned_iters_mgr->StartPinning();
1889
2008
  }
1890
2009
 
1891
- FilePicker fp(
1892
- storage_info_.files_, user_key, ikey, &storage_info_.level_files_brief_,
1893
- storage_info_.num_non_empty_levels_, &storage_info_.file_indexer_,
1894
- user_comparator(), internal_comparator());
2010
+ FilePicker fp(user_key, ikey, &storage_info_.level_files_brief_,
2011
+ storage_info_.num_non_empty_levels_,
2012
+ &storage_info_.file_indexer_, user_comparator(),
2013
+ internal_comparator());
1895
2014
  FdWithKeyRange* f = fp.GetNextFile();
1896
2015
 
1897
2016
  while (f != nullptr) {
@@ -1907,10 +2026,10 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
1907
2026
  bool timer_enabled =
1908
2027
  GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
1909
2028
  get_perf_context()->per_level_perf_context_enabled;
1910
- StopWatchNano timer(env_, timer_enabled /* auto_start */);
2029
+ StopWatchNano timer(clock_, timer_enabled /* auto_start */);
1911
2030
  *status = table_cache_->Get(
1912
2031
  read_options, *internal_comparator(), *f->file_metadata, ikey,
1913
- &get_context, mutable_cf_options_.prefix_extractor.get(),
2032
+ &get_context, mutable_cf_options_.prefix_extractor,
1914
2033
  cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
1915
2034
  IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
1916
2035
  fp.IsHitFileLastInLevel()),
@@ -1921,6 +2040,9 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
1921
2040
  fp.GetHitFileLevel());
1922
2041
  }
1923
2042
  if (!status->ok()) {
2043
+ if (db_statistics_ != nullptr) {
2044
+ get_context.ReportCounters();
2045
+ }
1924
2046
  return;
1925
2047
  }
1926
2048
 
@@ -1951,7 +2073,14 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
1951
2073
 
1952
2074
  if (is_blob_index) {
1953
2075
  if (do_merge && value) {
1954
- *status = GetBlob(read_options, user_key, *value, value);
2076
+ TEST_SYNC_POINT_CALLBACK("Version::Get::TamperWithBlobIndex",
2077
+ value);
2078
+
2079
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
2080
+ constexpr uint64_t* bytes_read = nullptr;
2081
+
2082
+ *status = GetBlob(read_options, user_key, *value, prefetch_buffer,
2083
+ value, bytes_read);
1955
2084
  if (!status->ok()) {
1956
2085
  if (status->IsIncomplete()) {
1957
2086
  get_context.MarkKeyMayExist();
@@ -1996,7 +2125,7 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
1996
2125
  std::string* str_value = value != nullptr ? value->GetSelf() : nullptr;
1997
2126
  *status = MergeHelper::TimedFullMerge(
1998
2127
  merge_operator_, user_key, nullptr, merge_context->GetOperands(),
1999
- str_value, info_log_, db_statistics_, env_,
2128
+ str_value, info_log_, db_statistics_, clock_,
2000
2129
  nullptr /* result_operand */, true);
2001
2130
  if (LIKELY(value != nullptr)) {
2002
2131
  value->PinSelf();
@@ -2027,15 +2156,16 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
2027
2156
  // use autovector in order to avoid unnecessary construction of GetContext
2028
2157
  // objects, which is expensive
2029
2158
  autovector<GetContext, 16> get_ctx;
2159
+ BlobFetcher blob_fetcher(this, read_options);
2030
2160
  for (auto iter = range->begin(); iter != range->end(); ++iter) {
2031
2161
  assert(iter->s->ok() || iter->s->IsMergeInProgress());
2032
2162
  get_ctx.emplace_back(
2033
2163
  user_comparator(), merge_operator_, info_log_, db_statistics_,
2034
2164
  iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge,
2035
2165
  iter->ukey_with_ts, iter->value, iter->timestamp, nullptr,
2036
- &(iter->merge_context), true, &iter->max_covering_tombstone_seq,
2037
- this->env_, nullptr, merge_operator_ ? &pinned_iters_mgr : nullptr,
2038
- callback, &iter->is_blob_index, tracing_mget_id);
2166
+ &(iter->merge_context), true, &iter->max_covering_tombstone_seq, clock_,
2167
+ nullptr, merge_operator_ ? &pinned_iters_mgr : nullptr, callback,
2168
+ &iter->is_blob_index, tracing_mget_id, &blob_fetcher);
2039
2169
  // MergeInProgress status, if set, has been transferred to the get_context
2040
2170
  // state, so we set status to ok here. From now on, the iter status will
2041
2171
  // be used for IO errors, and get_context state will be used for any
@@ -2060,15 +2190,38 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
2060
2190
  uint64_t num_data_read = 0;
2061
2191
  uint64_t num_sst_read = 0;
2062
2192
 
2193
+ MultiGetRange keys_with_blobs_range(*range, range->begin(), range->end());
2194
+ // blob_file => [[blob_idx, it], ...]
2195
+ std::unordered_map<uint64_t, BlobReadRequests> blob_rqs;
2196
+ int level = -1;
2197
+
2063
2198
  while (f != nullptr) {
2064
2199
  MultiGetRange file_range = fp.CurrentFileRange();
2065
2200
  bool timer_enabled =
2066
2201
  GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
2067
2202
  get_perf_context()->per_level_perf_context_enabled;
2068
- StopWatchNano timer(env_, timer_enabled /* auto_start */);
2203
+
2204
+ // Report MultiGet stats per level.
2205
+ if (level >= 0 && level != (int)fp.GetHitFileLevel()) {
2206
+ // Dump the stats if the search has moved to the next level and
2207
+ // reset for next level.
2208
+ RecordInHistogram(db_statistics_,
2209
+ NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
2210
+ num_index_read + num_filter_read);
2211
+ RecordInHistogram(db_statistics_, NUM_DATA_BLOCKS_READ_PER_LEVEL,
2212
+ num_data_read);
2213
+ RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
2214
+ num_filter_read = 0;
2215
+ num_index_read = 0;
2216
+ num_data_read = 0;
2217
+ num_sst_read = 0;
2218
+ level = fp.GetHitFileLevel();
2219
+ }
2220
+
2221
+ StopWatchNano timer(clock_, timer_enabled /* auto_start */);
2069
2222
  s = table_cache_->MultiGet(
2070
2223
  read_options, *internal_comparator(), *f->file_metadata, &file_range,
2071
- mutable_cf_options_.prefix_extractor.get(),
2224
+ mutable_cf_options_.prefix_extractor,
2072
2225
  cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
2073
2226
  IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
2074
2227
  fp.IsHitFileLastInLevel()),
@@ -2109,6 +2262,11 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
2109
2262
  num_filter_read += get_context.get_context_stats_.num_filter_read;
2110
2263
  num_data_read += get_context.get_context_stats_.num_data_read;
2111
2264
  num_sst_read += get_context.get_context_stats_.num_sst_read;
2265
+ // Reset these stats since they're specific to a level
2266
+ get_context.get_context_stats_.num_index_read = 0;
2267
+ get_context.get_context_stats_.num_filter_read = 0;
2268
+ get_context.get_context_stats_.num_data_read = 0;
2269
+ get_context.get_context_stats_.num_sst_read = 0;
2112
2270
 
2113
2271
  // report the counters before returning
2114
2272
  if (get_context.State() != GetContext::kNotFound &&
@@ -2145,22 +2303,27 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
2145
2303
 
2146
2304
  if (iter->is_blob_index) {
2147
2305
  if (iter->value) {
2148
- *status = GetBlob(read_options, iter->ukey_with_ts, *iter->value,
2149
- iter->value);
2150
- if (!status->ok()) {
2151
- if (status->IsIncomplete()) {
2152
- get_context.MarkKeyMayExist();
2153
- }
2154
-
2155
- continue;
2306
+ TEST_SYNC_POINT_CALLBACK("Version::MultiGet::TamperWithBlobIndex",
2307
+ &(*iter));
2308
+
2309
+ const Slice& blob_index_slice = *(iter->value);
2310
+ BlobIndex blob_index;
2311
+ Status tmp_s = blob_index.DecodeFrom(blob_index_slice);
2312
+ if (tmp_s.ok()) {
2313
+ const uint64_t blob_file_num = blob_index.file_number();
2314
+ blob_rqs[blob_file_num].emplace_back(
2315
+ std::make_pair(blob_index, std::cref(*iter)));
2316
+ } else {
2317
+ *(iter->s) = tmp_s;
2156
2318
  }
2157
2319
  }
2158
- }
2159
-
2160
- file_range.AddValueSize(iter->value->size());
2161
- if (file_range.GetValueSize() > read_options.value_size_soft_limit) {
2162
- s = Status::Aborted();
2163
- break;
2320
+ } else {
2321
+ file_range.AddValueSize(iter->value->size());
2322
+ if (file_range.GetValueSize() >
2323
+ read_options.value_size_soft_limit) {
2324
+ s = Status::Aborted();
2325
+ break;
2326
+ }
2164
2327
  }
2165
2328
  continue;
2166
2329
  case GetContext::kDeleted:
@@ -2183,22 +2346,6 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
2183
2346
  }
2184
2347
  }
2185
2348
 
2186
- // Report MultiGet stats per level.
2187
- if (fp.IsHitFileLastInLevel()) {
2188
- // Dump the stats if this is the last file of this level and reset for
2189
- // next level.
2190
- RecordInHistogram(db_statistics_,
2191
- NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
2192
- num_index_read + num_filter_read);
2193
- RecordInHistogram(db_statistics_, NUM_DATA_BLOCKS_READ_PER_LEVEL,
2194
- num_data_read);
2195
- RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
2196
- num_filter_read = 0;
2197
- num_index_read = 0;
2198
- num_data_read = 0;
2199
- num_sst_read = 0;
2200
- }
2201
-
2202
2349
  RecordInHistogram(db_statistics_, SST_BATCH_SIZE, batch_size);
2203
2350
  if (!s.ok() || file_picker_range.empty()) {
2204
2351
  break;
@@ -2206,6 +2353,17 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
2206
2353
  f = fp.GetNextFile();
2207
2354
  }
2208
2355
 
2356
+ // Dump stats for most recent level
2357
+ RecordInHistogram(db_statistics_, NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
2358
+ num_index_read + num_filter_read);
2359
+ RecordInHistogram(db_statistics_, NUM_DATA_BLOCKS_READ_PER_LEVEL,
2360
+ num_data_read);
2361
+ RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
2362
+
2363
+ if (s.ok() && !blob_rqs.empty()) {
2364
+ MultiGetBlob(read_options, keys_with_blobs_range, blob_rqs);
2365
+ }
2366
+
2209
2367
  // Process any left over keys
2210
2368
  for (auto iter = range->begin(); s.ok() && iter != range->end(); ++iter) {
2211
2369
  GetContext& get_context = *iter->get_context;
@@ -2228,7 +2386,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
2228
2386
  iter->value != nullptr ? iter->value->GetSelf() : nullptr;
2229
2387
  *status = MergeHelper::TimedFullMerge(
2230
2388
  merge_operator_, user_key, nullptr, iter->merge_context.GetOperands(),
2231
- str_value, info_log_, db_statistics_, env_,
2389
+ str_value, info_log_, db_statistics_, clock_,
2232
2390
  nullptr /* result_operand */, true);
2233
2391
  if (LIKELY(iter->value != nullptr)) {
2234
2392
  iter->value->PinSelf();
@@ -2267,20 +2425,31 @@ void VersionStorageInfo::GenerateLevelFilesBrief() {
2267
2425
  }
2268
2426
  }
2269
2427
 
2270
- void Version::PrepareApply(
2271
- const MutableCFOptions& mutable_cf_options,
2272
- bool update_stats) {
2428
+ void VersionStorageInfo::PrepareForVersionAppend(
2429
+ const ImmutableOptions& immutable_options,
2430
+ const MutableCFOptions& mutable_cf_options) {
2431
+ ComputeCompensatedSizes();
2432
+ UpdateNumNonEmptyLevels();
2433
+ CalculateBaseBytes(immutable_options, mutable_cf_options);
2434
+ UpdateFilesByCompactionPri(immutable_options, mutable_cf_options);
2435
+ GenerateFileIndexer();
2436
+ GenerateLevelFilesBrief();
2437
+ GenerateLevel0NonOverlapping();
2438
+ GenerateBottommostFiles();
2439
+ GenerateFileLocationIndex();
2440
+ }
2441
+
2442
+ void Version::PrepareAppend(const MutableCFOptions& mutable_cf_options,
2443
+ bool update_stats) {
2273
2444
  TEST_SYNC_POINT_CALLBACK(
2274
- "Version::PrepareApply:forced_check",
2445
+ "Version::PrepareAppend:forced_check",
2275
2446
  reinterpret_cast<void*>(&storage_info_.force_consistency_checks_));
2276
- UpdateAccumulatedStats(update_stats);
2277
- storage_info_.UpdateNumNonEmptyLevels();
2278
- storage_info_.CalculateBaseBytes(*cfd_->ioptions(), mutable_cf_options);
2279
- storage_info_.UpdateFilesByCompactionPri(cfd_->ioptions()->compaction_pri);
2280
- storage_info_.GenerateFileIndexer();
2281
- storage_info_.GenerateLevelFilesBrief();
2282
- storage_info_.GenerateLevel0NonOverlapping();
2283
- storage_info_.GenerateBottommostFiles();
2447
+
2448
+ if (update_stats) {
2449
+ UpdateAccumulatedStats();
2450
+ }
2451
+
2452
+ storage_info_.PrepareForVersionAppend(*cfd_->ioptions(), mutable_cf_options);
2284
2453
  }
2285
2454
 
2286
2455
  bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) {
@@ -2334,59 +2503,54 @@ void VersionStorageInfo::RemoveCurrentStats(FileMetaData* file_meta) {
2334
2503
  }
2335
2504
  }
2336
2505
 
2337
- void Version::UpdateAccumulatedStats(bool update_stats) {
2338
- if (update_stats) {
2339
- // maximum number of table properties loaded from files.
2340
- const int kMaxInitCount = 20;
2341
- int init_count = 0;
2342
- // here only the first kMaxInitCount files which haven't been
2343
- // initialized from file will be updated with num_deletions.
2344
- // The motivation here is to cap the maximum I/O per Version creation.
2345
- // The reason for choosing files from lower-level instead of higher-level
2346
- // is that such design is able to propagate the initialization from
2347
- // lower-level to higher-level: When the num_deletions of lower-level
2348
- // files are updated, it will make the lower-level files have accurate
2349
- // compensated_file_size, making lower-level to higher-level compaction
2350
- // will be triggered, which creates higher-level files whose num_deletions
2351
- // will be updated here.
2352
- for (int level = 0;
2353
- level < storage_info_.num_levels_ && init_count < kMaxInitCount;
2354
- ++level) {
2355
- for (auto* file_meta : storage_info_.files_[level]) {
2356
- if (MaybeInitializeFileMetaData(file_meta)) {
2357
- // each FileMeta will be initialized only once.
2358
- storage_info_.UpdateAccumulatedStats(file_meta);
2359
- // when option "max_open_files" is -1, all the file metadata has
2360
- // already been read, so MaybeInitializeFileMetaData() won't incur
2361
- // any I/O cost. "max_open_files=-1" means that the table cache passed
2362
- // to the VersionSet and then to the ColumnFamilySet has a size of
2363
- // TableCache::kInfiniteCapacity
2364
- if (vset_->GetColumnFamilySet()->get_table_cache()->GetCapacity() ==
2365
- TableCache::kInfiniteCapacity) {
2366
- continue;
2367
- }
2368
- if (++init_count >= kMaxInitCount) {
2369
- break;
2370
- }
2506
+ void Version::UpdateAccumulatedStats() {
2507
+ // maximum number of table properties loaded from files.
2508
+ const int kMaxInitCount = 20;
2509
+ int init_count = 0;
2510
+ // here only the first kMaxInitCount files which haven't been
2511
+ // initialized from file will be updated with num_deletions.
2512
+ // The motivation here is to cap the maximum I/O per Version creation.
2513
+ // The reason for choosing files from lower-level instead of higher-level
2514
+ // is that such design is able to propagate the initialization from
2515
+ // lower-level to higher-level: When the num_deletions of lower-level
2516
+ // files are updated, it will make the lower-level files have accurate
2517
+ // compensated_file_size, so that lower-level to higher-level compaction
2518
+ // will be triggered, which creates higher-level files whose num_deletions
2519
+ // will be updated here.
2520
+ for (int level = 0;
2521
+ level < storage_info_.num_levels_ && init_count < kMaxInitCount;
2522
+ ++level) {
2523
+ for (auto* file_meta : storage_info_.files_[level]) {
2524
+ if (MaybeInitializeFileMetaData(file_meta)) {
2525
+ // each FileMeta will be initialized only once.
2526
+ storage_info_.UpdateAccumulatedStats(file_meta);
2527
+ // when option "max_open_files" is -1, all the file metadata has
2528
+ // already been read, so MaybeInitializeFileMetaData() won't incur
2529
+ // any I/O cost. "max_open_files=-1" means that the table cache passed
2530
+ // to the VersionSet and then to the ColumnFamilySet has a size of
2531
+ // TableCache::kInfiniteCapacity
2532
+ if (vset_->GetColumnFamilySet()->get_table_cache()->GetCapacity() ==
2533
+ TableCache::kInfiniteCapacity) {
2534
+ continue;
2535
+ }
2536
+ if (++init_count >= kMaxInitCount) {
2537
+ break;
2371
2538
  }
2372
2539
  }
2373
2540
  }
2374
- // In case all sampled-files contain only deletion entries, then we
2375
- // load the table-property of a file in higher-level to initialize
2376
- // that value.
2377
- for (int level = storage_info_.num_levels_ - 1;
2378
- storage_info_.accumulated_raw_value_size_ == 0 && level >= 0;
2379
- --level) {
2380
- for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
2381
- storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
2382
- if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) {
2383
- storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
2384
- }
2541
+ }
2542
+ // In case all sampled-files contain only deletion entries, then we
2543
+ // load the table-property of a file in higher-level to initialize
2544
+ // that value.
2545
+ for (int level = storage_info_.num_levels_ - 1;
2546
+ storage_info_.accumulated_raw_value_size_ == 0 && level >= 0; --level) {
2547
+ for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
2548
+ storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
2549
+ if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) {
2550
+ storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
2385
2551
  }
2386
2552
  }
2387
2553
  }
2388
-
2389
- storage_info_.ComputeCompensatedSizes();
2390
2554
  }
2391
2555
 
2392
2556
  void VersionStorageInfo::ComputeCompensatedSizes() {
@@ -2521,13 +2685,13 @@ void VersionStorageInfo::EstimateCompactionBytesNeeded(
2521
2685
  }
2522
2686
 
2523
2687
  namespace {
2524
- uint32_t GetExpiredTtlFilesCount(const ImmutableCFOptions& ioptions,
2688
+ uint32_t GetExpiredTtlFilesCount(const ImmutableOptions& ioptions,
2525
2689
  const MutableCFOptions& mutable_cf_options,
2526
2690
  const std::vector<FileMetaData*>& files) {
2527
2691
  uint32_t ttl_expired_files_count = 0;
2528
2692
 
2529
2693
  int64_t _current_time;
2530
- auto status = ioptions.env->GetCurrentTime(&_current_time);
2694
+ auto status = ioptions.clock->GetCurrentTime(&_current_time);
2531
2695
  if (status.ok()) {
2532
2696
  const uint64_t current_time = static_cast<uint64_t>(_current_time);
2533
2697
  for (FileMetaData* f : files) {
@@ -2545,7 +2709,7 @@ uint32_t GetExpiredTtlFilesCount(const ImmutableCFOptions& ioptions,
2545
2709
  } // anonymous namespace
2546
2710
 
2547
2711
  void VersionStorageInfo::ComputeCompactionScore(
2548
- const ImmutableCFOptions& immutable_cf_options,
2712
+ const ImmutableOptions& immutable_options,
2549
2713
  const MutableCFOptions& mutable_cf_options) {
2550
2714
  for (int level = 0; level <= MaxInputLevel(); level++) {
2551
2715
  double score;
@@ -2588,7 +2752,12 @@ void VersionStorageInfo::ComputeCompactionScore(
2588
2752
  if (compaction_style_ == kCompactionStyleFIFO) {
2589
2753
  score = static_cast<double>(total_size) /
2590
2754
  mutable_cf_options.compaction_options_fifo.max_table_files_size;
2591
- if (mutable_cf_options.compaction_options_fifo.allow_compaction) {
2755
+ if (mutable_cf_options.compaction_options_fifo.allow_compaction ||
2756
+ mutable_cf_options.compaction_options_fifo.age_for_warm > 0) {
2757
+ // Warm tier move can happen at any time. It's too expensive to
2758
+ // check very file's timestamp now. For now, just trigger it
2759
+ // slightly more frequently than FIFO compaction so that this
2760
+ // happens first.
2592
2761
  score = std::max(
2593
2762
  static_cast<double>(num_sorted_runs) /
2594
2763
  mutable_cf_options.level0_file_num_compaction_trigger,
@@ -2597,10 +2766,9 @@ void VersionStorageInfo::ComputeCompactionScore(
2597
2766
  if (mutable_cf_options.ttl > 0) {
2598
2767
  score = std::max(
2599
2768
  static_cast<double>(GetExpiredTtlFilesCount(
2600
- immutable_cf_options, mutable_cf_options, files_[level])),
2769
+ immutable_options, mutable_cf_options, files_[level])),
2601
2770
  score);
2602
2771
  }
2603
-
2604
2772
  } else {
2605
2773
  score = static_cast<double>(num_sorted_runs) /
2606
2774
  mutable_cf_options.level0_file_num_compaction_trigger;
@@ -2609,7 +2777,7 @@ void VersionStorageInfo::ComputeCompactionScore(
2609
2777
  // L0 files. Take into account size as well to avoid later giant
2610
2778
  // compactions to the base level.
2611
2779
  uint64_t l0_target_size = mutable_cf_options.max_bytes_for_level_base;
2612
- if (immutable_cf_options.level_compaction_dynamic_level_bytes &&
2780
+ if (immutable_options.level_compaction_dynamic_level_bytes &&
2613
2781
  level_multiplier_ != 0.0) {
2614
2782
  // Prevent L0 to Lbase fanout from growing larger than
2615
2783
  // `level_multiplier_`. This prevents us from getting stuck picking
@@ -2657,12 +2825,21 @@ void VersionStorageInfo::ComputeCompactionScore(
2657
2825
  ComputeFilesMarkedForCompaction();
2658
2826
  ComputeBottommostFilesMarkedForCompaction();
2659
2827
  if (mutable_cf_options.ttl > 0) {
2660
- ComputeExpiredTtlFiles(immutable_cf_options, mutable_cf_options.ttl);
2828
+ ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl);
2661
2829
  }
2662
2830
  if (mutable_cf_options.periodic_compaction_seconds > 0) {
2663
2831
  ComputeFilesMarkedForPeriodicCompaction(
2664
- immutable_cf_options, mutable_cf_options.periodic_compaction_seconds);
2832
+ immutable_options, mutable_cf_options.periodic_compaction_seconds);
2833
+ }
2834
+
2835
+ if (mutable_cf_options.enable_blob_garbage_collection &&
2836
+ mutable_cf_options.blob_garbage_collection_age_cutoff > 0.0 &&
2837
+ mutable_cf_options.blob_garbage_collection_force_threshold < 1.0) {
2838
+ ComputeFilesMarkedForForcedBlobGC(
2839
+ mutable_cf_options.blob_garbage_collection_age_cutoff,
2840
+ mutable_cf_options.blob_garbage_collection_force_threshold);
2665
2841
  }
2842
+
2666
2843
  EstimateCompactionBytesNeeded(mutable_cf_options);
2667
2844
  }
2668
2845
 
@@ -2690,13 +2867,13 @@ void VersionStorageInfo::ComputeFilesMarkedForCompaction() {
2690
2867
  }
2691
2868
 
2692
2869
  void VersionStorageInfo::ComputeExpiredTtlFiles(
2693
- const ImmutableCFOptions& ioptions, const uint64_t ttl) {
2870
+ const ImmutableOptions& ioptions, const uint64_t ttl) {
2694
2871
  assert(ttl > 0);
2695
2872
 
2696
2873
  expired_ttl_files_.clear();
2697
2874
 
2698
2875
  int64_t _current_time;
2699
- auto status = ioptions.env->GetCurrentTime(&_current_time);
2876
+ auto status = ioptions.clock->GetCurrentTime(&_current_time);
2700
2877
  if (!status.ok()) {
2701
2878
  return;
2702
2879
  }
@@ -2716,14 +2893,14 @@ void VersionStorageInfo::ComputeExpiredTtlFiles(
2716
2893
  }
2717
2894
 
2718
2895
  void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction(
2719
- const ImmutableCFOptions& ioptions,
2896
+ const ImmutableOptions& ioptions,
2720
2897
  const uint64_t periodic_compaction_seconds) {
2721
2898
  assert(periodic_compaction_seconds > 0);
2722
2899
 
2723
2900
  files_marked_for_periodic_compaction_.clear();
2724
2901
 
2725
2902
  int64_t temp_current_time;
2726
- auto status = ioptions.env->GetCurrentTime(&temp_current_time);
2903
+ auto status = ioptions.clock->GetCurrentTime(&temp_current_time);
2727
2904
  if (!status.ok()) {
2728
2905
  return;
2729
2906
  }
@@ -2757,7 +2934,7 @@ void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction(
2757
2934
  status = ioptions.env->GetFileModificationTime(
2758
2935
  file_path, &file_modification_time);
2759
2936
  if (!status.ok()) {
2760
- ROCKS_LOG_WARN(ioptions.info_log,
2937
+ ROCKS_LOG_WARN(ioptions.logger,
2761
2938
  "Can't get file modification time: %s: %s",
2762
2939
  file_path.c_str(), status.ToString().c_str());
2763
2940
  continue;
@@ -2772,6 +2949,112 @@ void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction(
2772
2949
  }
2773
2950
  }
2774
2951
 
2952
+ void VersionStorageInfo::ComputeFilesMarkedForForcedBlobGC(
2953
+ double blob_garbage_collection_age_cutoff,
2954
+ double blob_garbage_collection_force_threshold) {
2955
+ files_marked_for_forced_blob_gc_.clear();
2956
+
2957
+ if (blob_files_.empty()) {
2958
+ return;
2959
+ }
2960
+
2961
+ // Number of blob files eligible for GC based on age
2962
+ const size_t cutoff_count = static_cast<size_t>(
2963
+ blob_garbage_collection_age_cutoff * blob_files_.size());
2964
+ if (!cutoff_count) {
2965
+ return;
2966
+ }
2967
+
2968
+ // Compute the sum of total and garbage bytes over the oldest batch of blob
2969
+ // files. The oldest batch is defined as the set of blob files which are
2970
+ // kept alive by the same SSTs as the very oldest one. Here is a toy example.
2971
+ // Let's assume we have three SSTs 1, 2, and 3, and four blob files 10, 11,
2972
+ // 12, and 13. Also, let's say SSTs 1 and 2 both rely on blob file 10 and
2973
+ // potentially some higher-numbered ones, while SST 3 relies on blob file 12
2974
+ // and potentially some higher-numbered ones. Then, the SST to oldest blob
2975
+ // file mapping is as follows:
2976
+ //
2977
+ // SST file number Oldest blob file number
2978
+ // 1 10
2979
+ // 2 10
2980
+ // 3 12
2981
+ //
2982
+ // This is what the same thing looks like from the blob files' POV. (Note that
2983
+ // the linked SSTs simply denote the inverse mapping of the above.)
2984
+ //
2985
+ // Blob file number Linked SST set
2986
+ // 10 {1, 2}
2987
+ // 11 {}
2988
+ // 12 {3}
2989
+ // 13 {}
2990
+ //
2991
+ // Then, the oldest batch of blob files consists of blob files 10 and 11,
2992
+ // and we can get rid of them by forcing the compaction of SSTs 1 and 2.
2993
+ //
2994
+ // Note that the overall ratio of garbage computed for the batch has to exceed
2995
+ // blob_garbage_collection_force_threshold and the entire batch has to be
2996
+ // eligible for GC according to blob_garbage_collection_age_cutoff in order
2997
+ // for us to schedule any compactions.
2998
+ const auto& oldest_meta = blob_files_.front();
2999
+ assert(oldest_meta);
3000
+
3001
+ const auto& linked_ssts = oldest_meta->GetLinkedSsts();
3002
+ assert(!linked_ssts.empty());
3003
+
3004
+ size_t count = 1;
3005
+ uint64_t sum_total_blob_bytes = oldest_meta->GetTotalBlobBytes();
3006
+ uint64_t sum_garbage_blob_bytes = oldest_meta->GetGarbageBlobBytes();
3007
+
3008
+ assert(cutoff_count <= blob_files_.size());
3009
+
3010
+ for (; count < cutoff_count; ++count) {
3011
+ const auto& meta = blob_files_[count];
3012
+ assert(meta);
3013
+
3014
+ if (!meta->GetLinkedSsts().empty()) {
3015
+ // Found the beginning of the next batch of blob files
3016
+ break;
3017
+ }
3018
+
3019
+ sum_total_blob_bytes += meta->GetTotalBlobBytes();
3020
+ sum_garbage_blob_bytes += meta->GetGarbageBlobBytes();
3021
+ }
3022
+
3023
+ if (count < blob_files_.size()) {
3024
+ const auto& meta = blob_files_[count];
3025
+ assert(meta);
3026
+
3027
+ if (meta->GetLinkedSsts().empty()) {
3028
+ // Some files in the oldest batch are not eligible for GC
3029
+ return;
3030
+ }
3031
+ }
3032
+
3033
+ if (sum_garbage_blob_bytes <
3034
+ blob_garbage_collection_force_threshold * sum_total_blob_bytes) {
3035
+ return;
3036
+ }
3037
+
3038
+ for (uint64_t sst_file_number : linked_ssts) {
3039
+ const FileLocation location = GetFileLocation(sst_file_number);
3040
+ assert(location.IsValid());
3041
+
3042
+ const int level = location.GetLevel();
3043
+ assert(level >= 0);
3044
+
3045
+ const size_t pos = location.GetPosition();
3046
+
3047
+ FileMetaData* const sst_meta = files_[level][pos];
3048
+ assert(sst_meta);
3049
+
3050
+ if (sst_meta->being_compacted) {
3051
+ continue;
3052
+ }
3053
+
3054
+ files_marked_for_forced_blob_gc_.emplace_back(level, sst_meta);
3055
+ }
3056
+ }
3057
+
2775
3058
  namespace {
2776
3059
 
2777
3060
  // used to sort files by size
@@ -2780,7 +3063,7 @@ struct Fsize {
2780
3063
  FileMetaData* file;
2781
3064
  };
2782
3065
 
2783
- // Compator that is used to sort files based on their size
3066
+ // Comparator that is used to sort files based on their size
2784
3067
  // In normal mode: descending size
2785
3068
  bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) {
2786
3069
  return (first.file->compensated_file_size >
@@ -2793,38 +3076,32 @@ void VersionStorageInfo::AddFile(int level, FileMetaData* f) {
2793
3076
  level_files.push_back(f);
2794
3077
 
2795
3078
  f->refs++;
2796
-
2797
- const uint64_t file_number = f->fd.GetNumber();
2798
-
2799
- assert(file_locations_.find(file_number) == file_locations_.end());
2800
- file_locations_.emplace(file_number,
2801
- FileLocation(level, level_files.size() - 1));
2802
3079
  }
2803
3080
 
2804
3081
  void VersionStorageInfo::AddBlobFile(
2805
3082
  std::shared_ptr<BlobFileMetaData> blob_file_meta) {
2806
3083
  assert(blob_file_meta);
2807
3084
 
2808
- const uint64_t blob_file_number = blob_file_meta->GetBlobFileNumber();
3085
+ assert(blob_files_.empty() ||
3086
+ (blob_files_.back() && blob_files_.back()->GetBlobFileNumber() <
3087
+ blob_file_meta->GetBlobFileNumber()));
2809
3088
 
2810
- auto it = blob_files_.lower_bound(blob_file_number);
2811
- assert(it == blob_files_.end() || it->first != blob_file_number);
3089
+ blob_files_.emplace_back(std::move(blob_file_meta));
3090
+ }
2812
3091
 
2813
- blob_files_.insert(
2814
- it, BlobFiles::value_type(blob_file_number, std::move(blob_file_meta)));
3092
+ VersionStorageInfo::BlobFiles::const_iterator
3093
+ VersionStorageInfo::GetBlobFileMetaDataLB(uint64_t blob_file_number) const {
3094
+ return std::lower_bound(
3095
+ blob_files_.begin(), blob_files_.end(), blob_file_number,
3096
+ [](const std::shared_ptr<BlobFileMetaData>& lhs, uint64_t rhs) {
3097
+ assert(lhs);
3098
+ return lhs->GetBlobFileNumber() < rhs;
3099
+ });
2815
3100
  }
2816
3101
 
2817
- // Version::PrepareApply() need to be called before calling the function, or
2818
- // following functions called:
2819
- // 1. UpdateNumNonEmptyLevels();
2820
- // 2. CalculateBaseBytes();
2821
- // 3. UpdateFilesByCompactionPri();
2822
- // 4. GenerateFileIndexer();
2823
- // 5. GenerateLevelFilesBrief();
2824
- // 6. GenerateLevel0NonOverlapping();
2825
- // 7. GenerateBottommostFiles();
2826
3102
  void VersionStorageInfo::SetFinalized() {
2827
3103
  finalized_ = true;
3104
+
2828
3105
  #ifndef NDEBUG
2829
3106
  if (compaction_style_ != kCompactionStyleLevel) {
2830
3107
  // Not level based compaction.
@@ -2875,11 +3152,22 @@ namespace {
2875
3152
  // Sort `temp` based on ratio of overlapping size over file size
2876
3153
  void SortFileByOverlappingRatio(
2877
3154
  const InternalKeyComparator& icmp, const std::vector<FileMetaData*>& files,
2878
- const std::vector<FileMetaData*>& next_level_files,
3155
+ const std::vector<FileMetaData*>& next_level_files, SystemClock* clock,
3156
+ int level, int num_non_empty_levels, uint64_t ttl,
2879
3157
  std::vector<Fsize>* temp) {
2880
3158
  std::unordered_map<uint64_t, uint64_t> file_to_order;
2881
3159
  auto next_level_it = next_level_files.begin();
2882
3160
 
3161
+ int64_t curr_time;
3162
+ Status status = clock->GetCurrentTime(&curr_time);
3163
+ if (!status.ok()) {
3164
+ // If we can't get time, disable TTL.
3165
+ ttl = 0;
3166
+ }
3167
+
3168
+ FileTtlBooster ttl_booster(static_cast<uint64_t>(curr_time), ttl,
3169
+ num_non_empty_levels, level);
3170
+
2883
3171
  for (auto& file : files) {
2884
3172
  uint64_t overlapping_bytes = 0;
2885
3173
  // Skip files in next level that is smaller than current file
@@ -2899,9 +3187,12 @@ void SortFileByOverlappingRatio(
2899
3187
  next_level_it++;
2900
3188
  }
2901
3189
 
3190
+ uint64_t ttl_boost_score = (ttl > 0) ? ttl_booster.GetBoostScore(file) : 1;
3191
+ assert(ttl_boost_score > 0);
2902
3192
  assert(file->compensated_file_size != 0);
2903
- file_to_order[file->fd.GetNumber()] =
2904
- overlapping_bytes * 1024u / file->compensated_file_size;
3193
+ file_to_order[file->fd.GetNumber()] = overlapping_bytes * 1024U /
3194
+ file->compensated_file_size /
3195
+ ttl_boost_score;
2905
3196
  }
2906
3197
 
2907
3198
  std::sort(temp->begin(), temp->end(),
@@ -2913,7 +3204,7 @@ void SortFileByOverlappingRatio(
2913
3204
  } // namespace
2914
3205
 
2915
3206
  void VersionStorageInfo::UpdateFilesByCompactionPri(
2916
- CompactionPri compaction_pri) {
3207
+ const ImmutableOptions& ioptions, const MutableCFOptions& options) {
2917
3208
  if (compaction_style_ == kCompactionStyleNone ||
2918
3209
  compaction_style_ == kCompactionStyleFIFO ||
2919
3210
  compaction_style_ == kCompactionStyleUniversal) {
@@ -2938,7 +3229,7 @@ void VersionStorageInfo::UpdateFilesByCompactionPri(
2938
3229
  if (num > temp.size()) {
2939
3230
  num = temp.size();
2940
3231
  }
2941
- switch (compaction_pri) {
3232
+ switch (ioptions.compaction_pri) {
2942
3233
  case kByCompensatedSize:
2943
3234
  std::partial_sort(temp.begin(), temp.begin() + num, temp.end(),
2944
3235
  CompareCompensatedSizeDescending);
@@ -2959,7 +3250,8 @@ void VersionStorageInfo::UpdateFilesByCompactionPri(
2959
3250
  break;
2960
3251
  case kMinOverlappingRatio:
2961
3252
  SortFileByOverlappingRatio(*internal_comparator_, files_[level],
2962
- files_[level + 1], &temp);
3253
+ files_[level + 1], ioptions.clock, level,
3254
+ num_non_empty_levels_, options.ttl, &temp);
2963
3255
  break;
2964
3256
  default:
2965
3257
  assert(false);
@@ -3027,6 +3319,28 @@ void VersionStorageInfo::GenerateBottommostFiles() {
3027
3319
  }
3028
3320
  }
3029
3321
 
3322
+ void VersionStorageInfo::GenerateFileLocationIndex() {
3323
+ size_t num_files = 0;
3324
+
3325
+ for (int level = 0; level < num_levels_; ++level) {
3326
+ num_files += files_[level].size();
3327
+ }
3328
+
3329
+ file_locations_.reserve(num_files);
3330
+
3331
+ for (int level = 0; level < num_levels_; ++level) {
3332
+ for (size_t pos = 0; pos < files_[level].size(); ++pos) {
3333
+ const FileMetaData* const meta = files_[level][pos];
3334
+ assert(meta);
3335
+
3336
+ const uint64_t file_number = meta->fd.GetNumber();
3337
+
3338
+ assert(file_locations_.find(file_number) == file_locations_.end());
3339
+ file_locations_.emplace(file_number, FileLocation(level, pos));
3340
+ }
3341
+ }
3342
+ }
3343
+
3030
3344
  void VersionStorageInfo::UpdateOldestSnapshot(SequenceNumber seqnum) {
3031
3345
  assert(seqnum >= oldest_snapshot_seqnum_);
3032
3346
  oldest_snapshot_seqnum_ = seqnum;
@@ -3040,8 +3354,7 @@ void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction() {
3040
3354
  bottommost_files_mark_threshold_ = kMaxSequenceNumber;
3041
3355
  for (auto& level_and_file : bottommost_files_) {
3042
3356
  if (!level_and_file.second->being_compacted &&
3043
- level_and_file.second->fd.largest_seqno != 0 &&
3044
- level_and_file.second->num_deletions > 1) {
3357
+ level_and_file.second->fd.largest_seqno != 0) {
3045
3358
  // largest_seqno might be nonzero due to containing the final key in an
3046
3359
  // earlier compaction, whose seqnum we didn't zero out. Multiple deletions
3047
3360
  // ensures the file really contains deleted or overwritten keys.
@@ -3200,7 +3513,7 @@ void VersionStorageInfo::GetCleanInputsWithinInterval(
3200
3513
  // specified range. From that file, iterate backwards and
3201
3514
  // forwards to find all overlapping files.
3202
3515
  // if within_range is set, then only store the maximum clean inputs
3203
- // within range [begin, end]. "clean" means there is a boudnary
3516
+ // within range [begin, end]. "clean" means there is a boundary
3204
3517
  // between the files in "*inputs" and the surrounding files
3205
3518
  void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
3206
3519
  int level, const InternalKey* begin, const InternalKey* end,
@@ -3367,7 +3680,7 @@ const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch,
3367
3680
  return scratch->buffer;
3368
3681
  }
3369
3682
 
3370
- int64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() {
3683
+ uint64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() {
3371
3684
  uint64_t result = 0;
3372
3685
  std::vector<FileMetaData*> overlaps;
3373
3686
  for (int level = 1; level < num_levels() - 1; level++) {
@@ -3390,7 +3703,7 @@ uint64_t VersionStorageInfo::MaxBytesForLevel(int level) const {
3390
3703
  return level_max_bytes_[level];
3391
3704
  }
3392
3705
 
3393
- void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions,
3706
+ void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions,
3394
3707
  const MutableCFOptions& options) {
3395
3708
  // Special logic to set number of sorted runs.
3396
3709
  // It is to match the previous behavior when all files are in L0.
@@ -3480,7 +3793,7 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions,
3480
3793
  // base_bytes_min. We set it be base_bytes_min.
3481
3794
  base_level_size = base_bytes_min + 1U;
3482
3795
  base_level_ = first_non_empty_level;
3483
- ROCKS_LOG_INFO(ioptions.info_log,
3796
+ ROCKS_LOG_INFO(ioptions.logger,
3484
3797
  "More existing levels in DB than needed. "
3485
3798
  "max_bytes_for_level_multiplier may not be guaranteed.");
3486
3799
  } else {
@@ -3511,7 +3824,7 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions,
3511
3824
  // 1. the L0 size is larger than level size base, or
3512
3825
  // 2. number of L0 files reaches twice the L0->L1 compaction trigger
3513
3826
  // We don't do this otherwise to keep the LSM-tree structure stable
3514
- // unless the L0 compation is backlogged.
3827
+ // unless the L0 compaction is backlogged.
3515
3828
  base_level_size = l0_size;
3516
3829
  if (base_level_ == num_levels_ - 1) {
3517
3830
  level_multiplier_ = 1.0;
@@ -3570,6 +3883,16 @@ uint64_t VersionStorageInfo::EstimateLiveDataSize() const {
3570
3883
  }
3571
3884
  }
3572
3885
  }
3886
+
3887
+ // For BlobDB, the result also includes the exact value of live bytes in the
3888
+ // blob files of the version.
3889
+ for (const auto& meta : blob_files_) {
3890
+ assert(meta);
3891
+
3892
+ size += meta->GetTotalBlobBytes();
3893
+ size -= meta->GetGarbageBlobBytes();
3894
+ }
3895
+
3573
3896
  return size;
3574
3897
  }
3575
3898
 
@@ -3619,8 +3942,7 @@ void Version::AddLiveFiles(std::vector<uint64_t>* live_table_files,
3619
3942
  }
3620
3943
 
3621
3944
  const auto& blob_files = storage_info_.GetBlobFiles();
3622
- for (const auto& pair : blob_files) {
3623
- const auto& meta = pair.second;
3945
+ for (const auto& meta : blob_files) {
3624
3946
  assert(meta);
3625
3947
 
3626
3948
  live_blob_files->emplace_back(meta->GetBlobFileNumber());
@@ -3677,8 +3999,7 @@ std::string Version::DebugString(bool hex, bool print_stats) const {
3677
3999
  r.append("--- blob files --- version# ");
3678
4000
  AppendNumberTo(&r, version_number_);
3679
4001
  r.append(" ---\n");
3680
- for (const auto& pair : blob_files) {
3681
- const auto& blob_file_meta = pair.second;
4002
+ for (const auto& blob_file_meta : blob_files) {
3682
4003
  assert(blob_file_meta);
3683
4004
 
3684
4005
  r.append(blob_file_meta->DebugString());
@@ -3774,19 +4095,22 @@ VersionSet::VersionSet(const std::string& dbname,
3774
4095
  WriteBufferManager* write_buffer_manager,
3775
4096
  WriteController* write_controller,
3776
4097
  BlockCacheTracer* const block_cache_tracer,
3777
- const std::shared_ptr<IOTracer>& io_tracer)
4098
+ const std::shared_ptr<IOTracer>& io_tracer,
4099
+ const std::string& db_session_id)
3778
4100
  : column_family_set_(
3779
4101
  new ColumnFamilySet(dbname, _db_options, storage_options, table_cache,
3780
4102
  write_buffer_manager, write_controller,
3781
- block_cache_tracer, io_tracer)),
4103
+ block_cache_tracer, io_tracer, db_session_id)),
3782
4104
  table_cache_(table_cache),
3783
4105
  env_(_db_options->env),
3784
4106
  fs_(_db_options->fs, io_tracer),
4107
+ clock_(_db_options->clock),
3785
4108
  dbname_(dbname),
3786
4109
  db_options_(_db_options),
3787
4110
  next_file_number_(2),
3788
4111
  manifest_file_number_(0), // Filled by Recover()
3789
4112
  options_file_number_(0),
4113
+ options_file_size_(0),
3790
4114
  pending_manifest_file_number_(0),
3791
4115
  last_sequence_(0),
3792
4116
  last_allocated_sequence_(0),
@@ -3796,7 +4120,8 @@ VersionSet::VersionSet(const std::string& dbname,
3796
4120
  manifest_file_size_(0),
3797
4121
  file_options_(storage_options),
3798
4122
  block_cache_tracer_(block_cache_tracer),
3799
- io_tracer_(io_tracer) {}
4123
+ io_tracer_(io_tracer),
4124
+ db_session_id_(db_session_id) {}
3800
4125
 
3801
4126
  VersionSet::~VersionSet() {
3802
4127
  // we need to delete column_family_set_ because its destructor depends on
@@ -3817,13 +4142,13 @@ void VersionSet::Reset() {
3817
4142
  if (column_family_set_) {
3818
4143
  WriteBufferManager* wbm = column_family_set_->write_buffer_manager();
3819
4144
  WriteController* wc = column_family_set_->write_controller();
3820
- column_family_set_.reset(
3821
- new ColumnFamilySet(dbname_, db_options_, file_options_, table_cache_,
3822
- wbm, wc, block_cache_tracer_, io_tracer_));
4145
+ column_family_set_.reset(new ColumnFamilySet(
4146
+ dbname_, db_options_, file_options_, table_cache_, wbm, wc,
4147
+ block_cache_tracer_, io_tracer_, db_session_id_));
3823
4148
  }
3824
4149
  db_id_.clear();
3825
4150
  next_file_number_.store(2);
3826
- min_log_number_to_keep_2pc_.store(0);
4151
+ min_log_number_to_keep_.store(0);
3827
4152
  manifest_file_number_ = 0;
3828
4153
  options_file_number_ = 0;
3829
4154
  pending_manifest_file_number_ = 0;
@@ -3885,9 +4210,16 @@ Status VersionSet::ProcessManifestWrites(
3885
4210
  autovector<const MutableCFOptions*> mutable_cf_options_ptrs;
3886
4211
  std::vector<std::unique_ptr<BaseReferencedVersionBuilder>> builder_guards;
3887
4212
 
4213
+ // Tracking `max_last_sequence` is needed to ensure we write
4214
+ // `VersionEdit::last_sequence_`s in non-decreasing order according to the
4215
+ // recovery code's requirement. It also allows us to defer updating
4216
+ // `descriptor_last_sequence_` until the apply phase, after the log phase
4217
+ // succeeds.
4218
+ SequenceNumber max_last_sequence = descriptor_last_sequence_;
4219
+
3888
4220
  if (first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
3889
4221
  // No group commits for column family add or drop
3890
- LogAndApplyCFHelper(first_writer.edit_list.front());
4222
+ LogAndApplyCFHelper(first_writer.edit_list.front(), &max_last_sequence);
3891
4223
  batch_edits.push_back(first_writer.edit_list.front());
3892
4224
  } else {
3893
4225
  auto it = manifest_writers_.cbegin();
@@ -3975,7 +4307,8 @@ Status VersionSet::ProcessManifestWrites(
3975
4307
  } else if (group_start != std::numeric_limits<size_t>::max()) {
3976
4308
  group_start = std::numeric_limits<size_t>::max();
3977
4309
  }
3978
- Status s = LogAndApplyHelper(last_writer->cfd, builder, e, mu);
4310
+ Status s = LogAndApplyHelper(last_writer->cfd, builder, e,
4311
+ &max_last_sequence, mu);
3979
4312
  if (!s.ok()) {
3980
4313
  // free up the allocated memory
3981
4314
  for (auto v : versions) {
@@ -4076,10 +4409,11 @@ Status VersionSet::ProcessManifestWrites(
4076
4409
  uint64_t new_manifest_file_size = 0;
4077
4410
  Status s;
4078
4411
  IOStatus io_s;
4412
+ IOStatus manifest_io_status;
4079
4413
  {
4080
4414
  FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_);
4081
4415
  mu->Unlock();
4082
-
4416
+ TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestStart");
4083
4417
  TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WriteManifest", nullptr);
4084
4418
  if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
4085
4419
  for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
@@ -4092,7 +4426,7 @@ Status VersionSet::ProcessManifestWrites(
4092
4426
  cfd->internal_stats(), 1 /* max_threads */,
4093
4427
  true /* prefetch_index_and_filter_in_cache */,
4094
4428
  false /* is_initial_load */,
4095
- mutable_cf_options_ptrs[i]->prefix_extractor.get(),
4429
+ mutable_cf_options_ptrs[i]->prefix_extractor,
4096
4430
  MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]));
4097
4431
  if (!s.ok()) {
4098
4432
  if (db_options_->paranoid_checks) {
@@ -4117,23 +4451,28 @@ Status VersionSet::ProcessManifestWrites(
4117
4451
  if (io_s.ok()) {
4118
4452
  descriptor_file->SetPreallocationBlockSize(
4119
4453
  db_options_->manifest_preallocation_size);
4120
-
4454
+ FileTypeSet tmp_set = db_options_->checksum_handoff_file_types;
4121
4455
  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
4122
- std::move(descriptor_file), descriptor_fname, opt_file_opts, env_,
4123
- io_tracer_, nullptr, db_options_->listeners));
4456
+ std::move(descriptor_file), descriptor_fname, opt_file_opts, clock_,
4457
+ io_tracer_, nullptr, db_options_->listeners, nullptr,
4458
+ tmp_set.Contains(FileType::kDescriptorFile),
4459
+ tmp_set.Contains(FileType::kDescriptorFile)));
4124
4460
  descriptor_log_.reset(
4125
4461
  new log::Writer(std::move(file_writer), 0, false));
4126
4462
  s = WriteCurrentStateToManifest(curr_state, wal_additions,
4127
4463
  descriptor_log_.get(), io_s);
4128
4464
  } else {
4465
+ manifest_io_status = io_s;
4129
4466
  s = io_s;
4130
4467
  }
4131
4468
  }
4132
4469
 
4133
4470
  if (s.ok()) {
4134
4471
  if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
4472
+ constexpr bool update_stats = true;
4473
+
4135
4474
  for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
4136
- versions[i]->PrepareApply(*mutable_cf_options_ptrs[i], true);
4475
+ versions[i]->PrepareAppend(*mutable_cf_options_ptrs[i], update_stats);
4137
4476
  }
4138
4477
  }
4139
4478
 
@@ -4148,8 +4487,8 @@ Status VersionSet::ProcessManifestWrites(
4148
4487
  e->DebugString(true));
4149
4488
  break;
4150
4489
  }
4151
- TEST_KILL_RANDOM("VersionSet::LogAndApply:BeforeAddRecord",
4152
- rocksdb_kill_odds * REDUCE_ODDS2);
4490
+ TEST_KILL_RANDOM_WITH_WEIGHT("VersionSet::LogAndApply:BeforeAddRecord",
4491
+ REDUCE_ODDS2);
4153
4492
  #ifndef NDEBUG
4154
4493
  if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) {
4155
4494
  TEST_SYNC_POINT_CALLBACK(
@@ -4163,11 +4502,13 @@ Status VersionSet::ProcessManifestWrites(
4163
4502
  io_s = descriptor_log_->AddRecord(record);
4164
4503
  if (!io_s.ok()) {
4165
4504
  s = io_s;
4505
+ manifest_io_status = io_s;
4166
4506
  break;
4167
4507
  }
4168
4508
  }
4169
4509
  if (s.ok()) {
4170
- io_s = SyncManifest(env_, db_options_, descriptor_log_->file());
4510
+ io_s = SyncManifest(db_options_, descriptor_log_->file());
4511
+ manifest_io_status = io_s;
4171
4512
  TEST_SYNC_POINT_CALLBACK(
4172
4513
  "VersionSet::ProcessManifestWrites:AfterSyncManifest", &io_s);
4173
4514
  }
@@ -4180,13 +4521,15 @@ Status VersionSet::ProcessManifestWrites(
4180
4521
 
4181
4522
  // If we just created a new descriptor file, install it by writing a
4182
4523
  // new CURRENT file that points to it.
4524
+ if (s.ok()) {
4525
+ assert(manifest_io_status.ok());
4526
+ }
4183
4527
  if (s.ok() && new_descriptor_log) {
4184
4528
  io_s = SetCurrentFile(fs_.get(), dbname_, pending_manifest_file_number_,
4185
4529
  db_directory);
4186
4530
  if (!io_s.ok()) {
4187
4531
  s = io_s;
4188
4532
  }
4189
- TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:AfterNewManifest");
4190
4533
  }
4191
4534
 
4192
4535
  if (s.ok()) {
@@ -4239,9 +4582,11 @@ Status VersionSet::ProcessManifestWrites(
4239
4582
  if (first_writer.edit_list.front()->is_column_family_add_) {
4240
4583
  assert(batch_edits.size() == 1);
4241
4584
  assert(new_cf_options != nullptr);
4585
+ assert(max_last_sequence == descriptor_last_sequence_);
4242
4586
  CreateColumnFamily(*new_cf_options, first_writer.edit_list.front());
4243
4587
  } else if (first_writer.edit_list.front()->is_column_family_drop_) {
4244
4588
  assert(batch_edits.size() == 1);
4589
+ assert(max_last_sequence == descriptor_last_sequence_);
4245
4590
  first_writer.cfd->SetDropped();
4246
4591
  first_writer.cfd->UnrefAndTryDelete();
4247
4592
  } else {
@@ -4272,8 +4617,7 @@ Status VersionSet::ProcessManifestWrites(
4272
4617
  }
4273
4618
 
4274
4619
  if (last_min_log_number_to_keep != 0) {
4275
- // Should only be set in 2PC mode.
4276
- MarkMinLogNumberToKeep2PC(last_min_log_number_to_keep);
4620
+ MarkMinLogNumberToKeep(last_min_log_number_to_keep);
4277
4621
  }
4278
4622
 
4279
4623
  for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
@@ -4281,6 +4625,8 @@ Status VersionSet::ProcessManifestWrites(
4281
4625
  AppendVersion(cfd, versions[i]);
4282
4626
  }
4283
4627
  }
4628
+ assert(max_last_sequence >= descriptor_last_sequence_);
4629
+ descriptor_last_sequence_ = max_last_sequence;
4284
4630
  manifest_file_number_ = pending_manifest_file_number_;
4285
4631
  manifest_file_size_ = new_manifest_file_size;
4286
4632
  prev_log_number_ = first_writer.edit_list.front()->prev_log_number_;
@@ -4295,11 +4641,41 @@ Status VersionSet::ProcessManifestWrites(
4295
4641
  for (auto v : versions) {
4296
4642
  delete v;
4297
4643
  }
4644
+ if (manifest_io_status.ok()) {
4645
+ manifest_file_number_ = pending_manifest_file_number_;
4646
+ manifest_file_size_ = new_manifest_file_size;
4647
+ }
4298
4648
  // If manifest append failed for whatever reason, the file could be
4299
4649
  // corrupted. So we need to force the next version update to start a
4300
4650
  // new manifest file.
4301
4651
  descriptor_log_.reset();
4302
- if (new_descriptor_log) {
4652
+ // If manifest operations failed, then we know the CURRENT file still
4653
+ // points to the original MANIFEST. Therefore, we can safely delete the
4654
+ // new MANIFEST.
4655
+ // If manifest operations succeeded, and we are here, then it is possible
4656
+ // that renaming tmp file to CURRENT failed.
4657
+ //
4658
+ // On local POSIX-compliant FS, the CURRENT must point to the original
4659
+ // MANIFEST. We can delete the new MANIFEST for simplicity, but we can also
4660
+ // keep it. Future recovery will ignore this MANIFEST. It's also ok for the
4661
+ // process not to crash and continue using the db. Any future LogAndApply()
4662
+ // call will switch to a new MANIFEST and update CURRENT, still ignoring
4663
+ // this one.
4664
+ //
4665
+ // On non-local FS, it is
4666
+ // possible that the rename operation succeeded on the server (remote)
4667
+ // side, but the client somehow returns a non-ok status to RocksDB. Note
4668
+ // that this does not violate atomicity. Should we delete the new MANIFEST
4669
+ // successfully, a subsequent recovery attempt will likely see the CURRENT
4670
+ // pointing to the new MANIFEST, thus fail. We will not be able to open the
4671
+ // DB again. Therefore, if manifest operations succeed, we should keep the
4672
+ // new MANIFEST. If the process proceeds, any future LogAndApply() call
4673
+ // will switch to a new MANIFEST and update CURRENT. If user tries to
4674
+ // re-open the DB,
4675
+ // a) CURRENT points to the new MANIFEST, and the new MANIFEST is present.
4676
+ // b) CURRENT points to the original MANIFEST, and the original MANIFEST
4677
+ // also exists.
4678
+ if (new_descriptor_log && !manifest_io_status.ok()) {
4303
4679
  ROCKS_LOG_INFO(db_options_->info_log,
4304
4680
  "Deleting manifest %" PRIu64 " current manifest %" PRIu64
4305
4681
  "\n",
@@ -4317,6 +4693,23 @@ Status VersionSet::ProcessManifestWrites(
4317
4693
 
4318
4694
  pending_manifest_file_number_ = 0;
4319
4695
 
4696
+ #ifndef NDEBUG
4697
+ // This is here kind of awkwardly because there's no other consistency
4698
+ // checks on `VersionSet`'s updates for the new `Version`s. We might want
4699
+ // to move it to a dedicated function, or remove it if we gain enough
4700
+ // confidence in `descriptor_last_sequence_`.
4701
+ if (s.ok()) {
4702
+ for (const auto* v : versions) {
4703
+ const auto* vstorage = v->storage_info();
4704
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
4705
+ for (const auto& file : vstorage->LevelFiles(level)) {
4706
+ assert(file->fd.largest_seqno <= descriptor_last_sequence_);
4707
+ }
4708
+ }
4709
+ }
4710
+ }
4711
+ #endif // NDEBUG
4712
+
4320
4713
  // wake up all the waiting writers
4321
4714
  while (true) {
4322
4715
  ManifestWriter* ready = manifest_writers_.front();
@@ -4346,7 +4739,15 @@ Status VersionSet::ProcessManifestWrites(
4346
4739
  return s;
4347
4740
  }
4348
4741
 
4349
- // 'datas' is gramatically incorrect. We still use this notation to indicate
4742
+ void VersionSet::WakeUpWaitingManifestWriters() {
4743
+ // wake up all the waiting writers
4744
+ // Notify new head of manifest write queue.
4745
+ if (!manifest_writers_.empty()) {
4746
+ manifest_writers_.front()->cv.Signal();
4747
+ }
4748
+ }
4749
+
4750
+ // 'datas' is grammatically incorrect. We still use this notation to indicate
4350
4751
  // that this variable represents a collection of column_family_data.
4351
4752
  Status VersionSet::LogAndApply(
4352
4753
  const autovector<ColumnFamilyData*>& column_family_datas,
@@ -4432,16 +4833,13 @@ Status VersionSet::LogAndApply(
4432
4833
  new_cf_options);
4433
4834
  }
4434
4835
 
4435
- void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) {
4836
+ void VersionSet::LogAndApplyCFHelper(VersionEdit* edit,
4837
+ SequenceNumber* max_last_sequence) {
4838
+ assert(max_last_sequence != nullptr);
4436
4839
  assert(edit->IsColumnFamilyManipulation());
4437
4840
  edit->SetNextFile(next_file_number_.load());
4438
- // The log might have data that is not visible to memtable and hence has not
4439
- // updated the last_sequence_ yet. It is also possible that the log is
4440
- // expecting some new data that is not written yet. Since LastSequence is an
4441
- // upper bound on the sequence, it is ok to record
4442
- // last_allocated_sequence_ as the last sequence.
4443
- edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_
4444
- : last_sequence_);
4841
+ assert(!edit->HasLastSequence());
4842
+ edit->SetLastSequence(*max_last_sequence);
4445
4843
  if (edit->is_column_family_drop_) {
4446
4844
  // if we drop column family, we have to make sure to save max column family,
4447
4845
  // so that we don't reuse existing ID
@@ -4451,12 +4849,14 @@ void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) {
4451
4849
 
4452
4850
  Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd,
4453
4851
  VersionBuilder* builder, VersionEdit* edit,
4852
+ SequenceNumber* max_last_sequence,
4454
4853
  InstrumentedMutex* mu) {
4455
4854
  #ifdef NDEBUG
4456
4855
  (void)cfd;
4457
4856
  #endif
4458
4857
  mu->AssertHeld();
4459
4858
  assert(!edit->IsColumnFamilyManipulation());
4859
+ assert(max_last_sequence != nullptr);
4460
4860
 
4461
4861
  if (edit->has_log_number_) {
4462
4862
  assert(edit->log_number_ >= cfd->GetLogNumber());
@@ -4467,13 +4867,11 @@ Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd,
4467
4867
  edit->SetPrevLogNumber(prev_log_number_);
4468
4868
  }
4469
4869
  edit->SetNextFile(next_file_number_.load());
4470
- // The log might have data that is not visible to memtable and hence has not
4471
- // updated the last_sequence_ yet. It is also possible that the log is
4472
- // expecting some new data that is not written yet. Since LastSequence is an
4473
- // upper bound on the sequence, it is ok to record
4474
- // last_allocated_sequence_ as the last sequence.
4475
- edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_
4476
- : last_sequence_);
4870
+ if (edit->HasLastSequence() && edit->GetLastSequence() > *max_last_sequence) {
4871
+ *max_last_sequence = edit->GetLastSequence();
4872
+ } else {
4873
+ edit->SetLastSequence(*max_last_sequence);
4874
+ }
4477
4875
 
4478
4876
  // The builder can be nullptr only if edit is WAL manipulation,
4479
4877
  // because WAL edits do not need to be applied to versions,
@@ -4482,171 +4880,13 @@ Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd,
4482
4880
  return builder ? builder->Apply(edit) : Status::OK();
4483
4881
  }
4484
4882
 
4485
- Status VersionSet::ApplyOneVersionEditToBuilder(
4486
- VersionEdit& edit,
4487
- const std::unordered_map<std::string, ColumnFamilyOptions>& name_to_options,
4488
- std::unordered_map<int, std::string>& column_families_not_found,
4489
- std::unordered_map<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>&
4490
- builders,
4491
- VersionEditParams* version_edit_params) {
4492
- // Not found means that user didn't supply that column
4493
- // family option AND we encountered column family add
4494
- // record. Once we encounter column family drop record,
4495
- // we will delete the column family from
4496
- // column_families_not_found.
4497
- bool cf_in_not_found = (column_families_not_found.find(edit.column_family_) !=
4498
- column_families_not_found.end());
4499
- // in builders means that user supplied that column family
4500
- // option AND that we encountered column family add record
4501
- bool cf_in_builders = builders.find(edit.column_family_) != builders.end();
4502
-
4503
- // they can't both be true
4504
- assert(!(cf_in_not_found && cf_in_builders));
4505
-
4506
- ColumnFamilyData* cfd = nullptr;
4507
-
4508
- if (edit.is_column_family_add_) {
4509
- if (cf_in_builders || cf_in_not_found) {
4510
- return Status::Corruption(
4511
- "Manifest adding the same column family twice: " +
4512
- edit.column_family_name_);
4513
- }
4514
- auto cf_options = name_to_options.find(edit.column_family_name_);
4515
- // implicitly add persistent_stats column family without requiring user
4516
- // to specify
4517
- bool is_persistent_stats_column_family =
4518
- edit.column_family_name_.compare(kPersistentStatsColumnFamilyName) == 0;
4519
- if (cf_options == name_to_options.end() &&
4520
- !is_persistent_stats_column_family) {
4521
- column_families_not_found.insert(
4522
- {edit.column_family_, edit.column_family_name_});
4523
- } else {
4524
- // recover persistent_stats CF from a DB that already contains it
4525
- if (is_persistent_stats_column_family) {
4526
- ColumnFamilyOptions cfo;
4527
- OptimizeForPersistentStats(&cfo);
4528
- cfd = CreateColumnFamily(cfo, &edit);
4529
- } else {
4530
- cfd = CreateColumnFamily(cf_options->second, &edit);
4531
- }
4532
- cfd->set_initialized();
4533
- builders.insert(std::make_pair(
4534
- edit.column_family_, std::unique_ptr<BaseReferencedVersionBuilder>(
4535
- new BaseReferencedVersionBuilder(cfd))));
4536
- }
4537
- } else if (edit.is_column_family_drop_) {
4538
- if (cf_in_builders) {
4539
- auto builder = builders.find(edit.column_family_);
4540
- assert(builder != builders.end());
4541
- builders.erase(builder);
4542
- cfd = column_family_set_->GetColumnFamily(edit.column_family_);
4543
- assert(cfd != nullptr);
4544
- if (cfd->UnrefAndTryDelete()) {
4545
- cfd = nullptr;
4546
- } else {
4547
- // who else can have reference to cfd!?
4548
- assert(false);
4549
- }
4550
- } else if (cf_in_not_found) {
4551
- column_families_not_found.erase(edit.column_family_);
4552
- } else {
4553
- return Status::Corruption(
4554
- "Manifest - dropping non-existing column family");
4555
- }
4556
- } else if (edit.IsWalAddition()) {
4557
- Status s = wals_.AddWals(edit.GetWalAdditions());
4558
- if (!s.ok()) {
4559
- return s;
4560
- }
4561
- } else if (edit.IsWalDeletion()) {
4562
- Status s = wals_.DeleteWalsBefore(edit.GetWalDeletion().GetLogNumber());
4563
- if (!s.ok()) {
4564
- return s;
4565
- }
4566
- } else if (!cf_in_not_found) {
4567
- if (!cf_in_builders) {
4568
- return Status::Corruption(
4569
- "Manifest record referencing unknown column family");
4570
- }
4571
-
4572
- cfd = column_family_set_->GetColumnFamily(edit.column_family_);
4573
- // this should never happen since cf_in_builders is true
4574
- assert(cfd != nullptr);
4575
-
4576
- // if it is not column family add or column family drop,
4577
- // then it's a file add/delete, which should be forwarded
4578
- // to builder
4579
- auto builder = builders.find(edit.column_family_);
4580
- assert(builder != builders.end());
4581
- Status s = builder->second->version_builder()->Apply(&edit);
4582
- if (!s.ok()) {
4583
- return s;
4584
- }
4585
- }
4586
- return ExtractInfoFromVersionEdit(cfd, edit, version_edit_params);
4587
- }
4588
-
4589
- Status VersionSet::ExtractInfoFromVersionEdit(
4590
- ColumnFamilyData* cfd, const VersionEdit& from_edit,
4591
- VersionEditParams* version_edit_params) {
4592
- if (cfd != nullptr) {
4593
- if (from_edit.has_db_id_) {
4594
- version_edit_params->SetDBId(from_edit.db_id_);
4595
- }
4596
- if (from_edit.has_log_number_) {
4597
- if (cfd->GetLogNumber() > from_edit.log_number_) {
4598
- ROCKS_LOG_WARN(
4599
- db_options_->info_log,
4600
- "MANIFEST corruption detected, but ignored - Log numbers in "
4601
- "records NOT monotonically increasing");
4602
- } else {
4603
- cfd->SetLogNumber(from_edit.log_number_);
4604
- version_edit_params->SetLogNumber(from_edit.log_number_);
4605
- }
4606
- }
4607
- if (from_edit.has_comparator_ &&
4608
- from_edit.comparator_ != cfd->user_comparator()->Name()) {
4609
- return Status::InvalidArgument(
4610
- cfd->user_comparator()->Name(),
4611
- "does not match existing comparator " + from_edit.comparator_);
4612
- }
4613
- if (from_edit.HasFullHistoryTsLow()) {
4614
- const std::string& new_ts = from_edit.GetFullHistoryTsLow();
4615
- cfd->SetFullHistoryTsLow(new_ts);
4616
- }
4617
- }
4618
-
4619
- if (from_edit.has_prev_log_number_) {
4620
- version_edit_params->SetPrevLogNumber(from_edit.prev_log_number_);
4621
- }
4622
-
4623
- if (from_edit.has_next_file_number_) {
4624
- version_edit_params->SetNextFile(from_edit.next_file_number_);
4625
- }
4626
-
4627
- if (from_edit.has_max_column_family_) {
4628
- version_edit_params->SetMaxColumnFamily(from_edit.max_column_family_);
4629
- }
4630
-
4631
- if (from_edit.has_min_log_number_to_keep_) {
4632
- version_edit_params->min_log_number_to_keep_ =
4633
- std::max(version_edit_params->min_log_number_to_keep_,
4634
- from_edit.min_log_number_to_keep_);
4635
- }
4636
-
4637
- if (from_edit.has_last_sequence_) {
4638
- version_edit_params->SetLastSequence(from_edit.last_sequence_);
4639
- }
4640
- return Status::OK();
4641
- }
4642
-
4643
- Status VersionSet::GetCurrentManifestPath(const std::string& dbname,
4644
- FileSystem* fs,
4645
- std::string* manifest_path,
4646
- uint64_t* manifest_file_number) {
4647
- assert(fs != nullptr);
4648
- assert(manifest_path != nullptr);
4649
- assert(manifest_file_number != nullptr);
4883
+ Status VersionSet::GetCurrentManifestPath(const std::string& dbname,
4884
+ FileSystem* fs,
4885
+ std::string* manifest_path,
4886
+ uint64_t* manifest_file_number) {
4887
+ assert(fs != nullptr);
4888
+ assert(manifest_path != nullptr);
4889
+ assert(manifest_file_number != nullptr);
4650
4890
 
4651
4891
  std::string fname;
4652
4892
  Status s = ReadFileToString(fs, CurrentFileName(dbname), &fname);
@@ -4671,77 +4911,6 @@ Status VersionSet::GetCurrentManifestPath(const std::string& dbname,
4671
4911
  return Status::OK();
4672
4912
  }
4673
4913
 
4674
- Status VersionSet::ReadAndRecover(
4675
- log::Reader& reader, AtomicGroupReadBuffer* read_buffer,
4676
- const std::unordered_map<std::string, ColumnFamilyOptions>& name_to_options,
4677
- std::unordered_map<int, std::string>& column_families_not_found,
4678
- std::unordered_map<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>&
4679
- builders,
4680
- Status* log_read_status, VersionEditParams* version_edit_params,
4681
- std::string* db_id) {
4682
- assert(read_buffer != nullptr);
4683
- assert(log_read_status != nullptr);
4684
- Status s;
4685
- Slice record;
4686
- std::string scratch;
4687
- size_t recovered_edits = 0;
4688
- while (s.ok() && reader.ReadRecord(&record, &scratch) &&
4689
- log_read_status->ok()) {
4690
- VersionEdit edit;
4691
- s = edit.DecodeFrom(record);
4692
- if (!s.ok()) {
4693
- break;
4694
- }
4695
- if (edit.has_db_id_) {
4696
- db_id_ = edit.GetDbId();
4697
- if (db_id != nullptr) {
4698
- db_id->assign(edit.GetDbId());
4699
- }
4700
- }
4701
- s = read_buffer->AddEdit(&edit);
4702
- if (!s.ok()) {
4703
- break;
4704
- }
4705
- if (edit.is_in_atomic_group_) {
4706
- if (read_buffer->IsFull()) {
4707
- // Apply edits in an atomic group when we have read all edits in the
4708
- // group.
4709
- for (auto& e : read_buffer->replay_buffer()) {
4710
- s = ApplyOneVersionEditToBuilder(e, name_to_options,
4711
- column_families_not_found, builders,
4712
- version_edit_params);
4713
- if (!s.ok()) {
4714
- break;
4715
- }
4716
- recovered_edits++;
4717
- }
4718
- if (!s.ok()) {
4719
- break;
4720
- }
4721
- read_buffer->Clear();
4722
- }
4723
- } else {
4724
- // Apply a normal edit immediately.
4725
- s = ApplyOneVersionEditToBuilder(edit, name_to_options,
4726
- column_families_not_found, builders,
4727
- version_edit_params);
4728
- if (s.ok()) {
4729
- recovered_edits++;
4730
- }
4731
- }
4732
- }
4733
- if (!log_read_status->ok()) {
4734
- s = *log_read_status;
4735
- }
4736
- if (!s.ok()) {
4737
- // Clear the buffer if we fail to decode/apply an edit.
4738
- read_buffer->Clear();
4739
- }
4740
- TEST_SYNC_POINT_CALLBACK("VersionSet::ReadAndRecover:RecoveredEdits",
4741
- &recovered_edits);
4742
- return s;
4743
- }
4744
-
4745
4914
  Status VersionSet::Recover(
4746
4915
  const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
4747
4916
  std::string* db_id) {
@@ -4765,9 +4934,9 @@ Status VersionSet::Recover(
4765
4934
  if (!s.ok()) {
4766
4935
  return s;
4767
4936
  }
4768
- manifest_file_reader.reset(
4769
- new SequentialFileReader(std::move(manifest_file), manifest_path,
4770
- db_options_->log_readahead_size, io_tracer_));
4937
+ manifest_file_reader.reset(new SequentialFileReader(
4938
+ std::move(manifest_file), manifest_path,
4939
+ db_options_->log_readahead_size, io_tracer_, db_options_->listeners));
4771
4940
  }
4772
4941
  uint64_t current_manifest_file_size = 0;
4773
4942
  uint64_t log_number = 0;
@@ -4777,10 +4946,10 @@ Status VersionSet::Recover(
4777
4946
  reporter.status = &log_read_status;
4778
4947
  log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter,
4779
4948
  true /* checksum */, 0 /* log_number */);
4780
- VersionEditHandler handler(
4781
- read_only, column_families, const_cast<VersionSet*>(this),
4782
- /*track_missing_files=*/false,
4783
- /*no_error_if_table_files_missing=*/false, io_tracer_);
4949
+ VersionEditHandler handler(read_only, column_families,
4950
+ const_cast<VersionSet*>(this),
4951
+ /*track_missing_files=*/false,
4952
+ /*no_error_if_files_missing=*/false, io_tracer_);
4784
4953
  handler.Iterate(reader, &log_read_status);
4785
4954
  s = handler.status();
4786
4955
  if (s.ok()) {
@@ -4802,7 +4971,7 @@ Status VersionSet::Recover(
4802
4971
  ",min_log_number_to_keep is %" PRIu64 "\n",
4803
4972
  manifest_path.c_str(), manifest_file_number_, next_file_number_.load(),
4804
4973
  last_sequence_.load(), log_number, prev_log_number_,
4805
- column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep_2pc());
4974
+ column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep());
4806
4975
 
4807
4976
  for (auto cfd : *column_family_set_) {
4808
4977
  if (cfd->IsDropped()) {
@@ -4937,9 +5106,9 @@ Status VersionSet::TryRecoverFromOneManifest(
4937
5106
  if (!s.ok()) {
4938
5107
  return s;
4939
5108
  }
4940
- manifest_file_reader.reset(
4941
- new SequentialFileReader(std::move(manifest_file), manifest_path,
4942
- db_options_->log_readahead_size, io_tracer_));
5109
+ manifest_file_reader.reset(new SequentialFileReader(
5110
+ std::move(manifest_file), manifest_path,
5111
+ db_options_->log_readahead_size, io_tracer_, db_options_->listeners));
4943
5112
  }
4944
5113
 
4945
5114
  assert(s.ok());
@@ -4963,9 +5132,6 @@ Status VersionSet::TryRecoverFromOneManifest(
4963
5132
  Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
4964
5133
  const std::string& dbname,
4965
5134
  FileSystem* fs) {
4966
- // these are just for performance reasons, not correcntes,
4967
- // so we're fine using the defaults
4968
- FileOptions soptions;
4969
5135
  // Read "CURRENT" file, which contains a pointer to the current manifest file
4970
5136
  std::string manifest_path;
4971
5137
  uint64_t manifest_file_number;
@@ -4974,16 +5140,24 @@ Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
4974
5140
  if (!s.ok()) {
4975
5141
  return s;
4976
5142
  }
5143
+ return ListColumnFamiliesFromManifest(manifest_path, fs, column_families);
5144
+ }
4977
5145
 
5146
+ Status VersionSet::ListColumnFamiliesFromManifest(
5147
+ const std::string& manifest_path, FileSystem* fs,
5148
+ std::vector<std::string>* column_families) {
4978
5149
  std::unique_ptr<SequentialFileReader> file_reader;
5150
+ Status s;
4979
5151
  {
4980
5152
  std::unique_ptr<FSSequentialFile> file;
4981
- s = fs->NewSequentialFile(manifest_path, soptions, &file, nullptr);
5153
+ // these are just for performance reasons, not correctness,
5154
+ // so we're fine using the defaults
5155
+ s = fs->NewSequentialFile(manifest_path, FileOptions(), &file, nullptr);
4982
5156
  if (!s.ok()) {
4983
5157
  return s;
4984
- }
4985
- file_reader.reset(new SequentialFileReader(std::move(file), manifest_path,
4986
- nullptr /*IOTracer*/));
5158
+ }
5159
+ file_reader = std::make_unique<SequentialFileReader>(
5160
+ std::move(file), manifest_path, /*io_tracer=*/nullptr);
4987
5161
  }
4988
5162
 
4989
5163
  VersionSet::LogReporter reporter;
@@ -5022,7 +5196,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
5022
5196
  WriteController wc(options->delayed_write_rate);
5023
5197
  WriteBufferManager wb(options->db_write_buffer_size);
5024
5198
  VersionSet versions(dbname, &db_options, file_options, tc.get(), &wb, &wc,
5025
- nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/);
5199
+ nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/,
5200
+ /*db_session_id*/ "");
5026
5201
  Status status;
5027
5202
 
5028
5203
  std::vector<ColumnFamilyDescriptor> dummy;
@@ -5104,7 +5279,7 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
5104
5279
  }
5105
5280
 
5106
5281
  // Get the checksum information including the checksum and checksum function
5107
- // name of all SST files in VersionSet. Store the information in
5282
+ // name of all SST and blob files in VersionSet. Store the information in
5108
5283
  // FileChecksumList which contains a map from file number to its checksum info.
5109
5284
  // If DB is not running, make sure call VersionSet::Recover() to load the file
5110
5285
  // metadata from Manifest to VersionSet before calling this function.
@@ -5118,35 +5293,70 @@ Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) {
5118
5293
  checksum_list->reset();
5119
5294
 
5120
5295
  for (auto cfd : *column_family_set_) {
5296
+ assert(cfd);
5297
+
5121
5298
  if (cfd->IsDropped() || !cfd->initialized()) {
5122
5299
  continue;
5123
5300
  }
5301
+
5302
+ const auto* current = cfd->current();
5303
+ assert(current);
5304
+
5305
+ const auto* vstorage = current->storage_info();
5306
+ assert(vstorage);
5307
+
5308
+ /* SST files */
5124
5309
  for (int level = 0; level < cfd->NumberLevels(); level++) {
5125
- for (const auto& file :
5126
- cfd->current()->storage_info()->LevelFiles(level)) {
5310
+ const auto& level_files = vstorage->LevelFiles(level);
5311
+
5312
+ for (const auto& file : level_files) {
5313
+ assert(file);
5314
+
5127
5315
  s = checksum_list->InsertOneFileChecksum(file->fd.GetNumber(),
5128
5316
  file->file_checksum,
5129
5317
  file->file_checksum_func_name);
5130
5318
  if (!s.ok()) {
5131
- break;
5319
+ return s;
5132
5320
  }
5133
5321
  }
5322
+ }
5323
+
5324
+ /* Blob files */
5325
+ const auto& blob_files = vstorage->GetBlobFiles();
5326
+ for (const auto& meta : blob_files) {
5327
+ assert(meta);
5328
+
5329
+ std::string checksum_value = meta->GetChecksumValue();
5330
+ std::string checksum_method = meta->GetChecksumMethod();
5331
+ assert(checksum_value.empty() == checksum_method.empty());
5332
+ if (meta->GetChecksumMethod().empty()) {
5333
+ checksum_value = kUnknownFileChecksum;
5334
+ checksum_method = kUnknownFileChecksumFuncName;
5335
+ }
5336
+
5337
+ s = checksum_list->InsertOneFileChecksum(meta->GetBlobFileNumber(),
5338
+ checksum_value, checksum_method);
5134
5339
  if (!s.ok()) {
5135
- break;
5340
+ return s;
5136
5341
  }
5137
5342
  }
5138
- if (!s.ok()) {
5139
- break;
5140
- }
5141
5343
  }
5344
+
5142
5345
  return s;
5143
5346
  }
5144
5347
 
5145
5348
  Status VersionSet::DumpManifest(Options& options, std::string& dscname,
5146
5349
  bool verbose, bool hex, bool json) {
5350
+ assert(options.env);
5351
+ std::vector<std::string> column_families;
5352
+ Status s = ListColumnFamiliesFromManifest(
5353
+ dscname, options.env->GetFileSystem().get(), &column_families);
5354
+ if (!s.ok()) {
5355
+ return s;
5356
+ }
5357
+
5147
5358
  // Open the specified manifest file.
5148
5359
  std::unique_ptr<SequentialFileReader> file_reader;
5149
- Status s;
5150
5360
  {
5151
5361
  std::unique_ptr<FSSequentialFile> file;
5152
5362
  const std::shared_ptr<FileSystem>& fs = options.env->GetFileSystem();
@@ -5157,14 +5367,16 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
5157
5367
  if (!s.ok()) {
5158
5368
  return s;
5159
5369
  }
5160
- file_reader.reset(new SequentialFileReader(
5161
- std::move(file), dscname, db_options_->log_readahead_size, io_tracer_));
5370
+ file_reader = std::make_unique<SequentialFileReader>(
5371
+ std::move(file), dscname, db_options_->log_readahead_size, io_tracer_);
5162
5372
  }
5163
5373
 
5164
- std::vector<ColumnFamilyDescriptor> column_families(
5165
- 1, ColumnFamilyDescriptor(kDefaultColumnFamilyName, options));
5166
- DumpManifestHandler handler(column_families, this, io_tracer_, verbose, hex,
5167
- json);
5374
+ std::vector<ColumnFamilyDescriptor> cf_descs;
5375
+ for (const auto& cf : column_families) {
5376
+ cf_descs.emplace_back(cf, options);
5377
+ }
5378
+
5379
+ DumpManifestHandler handler(cf_descs, this, io_tracer_, verbose, hex, json);
5168
5380
  {
5169
5381
  VersionSet::LogReporter reporter;
5170
5382
  reporter.status = &s;
@@ -5186,9 +5398,9 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) {
5186
5398
  }
5187
5399
  // Called only either from ::LogAndApply which is protected by mutex or during
5188
5400
  // recovery which is single-threaded.
5189
- void VersionSet::MarkMinLogNumberToKeep2PC(uint64_t number) {
5190
- if (min_log_number_to_keep_2pc_.load(std::memory_order_relaxed) < number) {
5191
- min_log_number_to_keep_2pc_.store(number, std::memory_order_relaxed);
5401
+ void VersionSet::MarkMinLogNumberToKeep(uint64_t number) {
5402
+ if (min_log_number_to_keep_.load(std::memory_order_relaxed) < number) {
5403
+ min_log_number_to_keep_.store(number, std::memory_order_relaxed);
5192
5404
  }
5193
5405
  }
5194
5406
 
@@ -5268,28 +5480,33 @@ Status VersionSet::WriteCurrentStateToManifest(
5268
5480
  VersionEdit edit;
5269
5481
  edit.SetColumnFamily(cfd->GetID());
5270
5482
 
5271
- assert(cfd->current());
5272
- assert(cfd->current()->storage_info());
5483
+ const auto* current = cfd->current();
5484
+ assert(current);
5485
+
5486
+ const auto* vstorage = current->storage_info();
5487
+ assert(vstorage);
5273
5488
 
5274
5489
  for (int level = 0; level < cfd->NumberLevels(); level++) {
5275
- for (const auto& f :
5276
- cfd->current()->storage_info()->LevelFiles(level)) {
5277
- edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(),
5278
- f->fd.GetFileSize(), f->smallest, f->largest,
5279
- f->fd.smallest_seqno, f->fd.largest_seqno,
5280
- f->marked_for_compaction, f->oldest_blob_file_number,
5281
- f->oldest_ancester_time, f->file_creation_time,
5282
- f->file_checksum, f->file_checksum_func_name);
5490
+ const auto& level_files = vstorage->LevelFiles(level);
5491
+
5492
+ for (const auto& f : level_files) {
5493
+ assert(f);
5494
+
5495
+ edit.AddFile(
5496
+ level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(),
5497
+ f->smallest, f->largest, f->fd.smallest_seqno,
5498
+ f->fd.largest_seqno, f->marked_for_compaction, f->temperature,
5499
+ f->oldest_blob_file_number, f->oldest_ancester_time,
5500
+ f->file_creation_time, f->file_checksum,
5501
+ f->file_checksum_func_name, f->min_timestamp, f->max_timestamp);
5283
5502
  }
5284
5503
  }
5285
5504
 
5286
- const auto& blob_files = cfd->current()->storage_info()->GetBlobFiles();
5287
- for (const auto& pair : blob_files) {
5288
- const uint64_t blob_file_number = pair.first;
5289
- const auto& meta = pair.second;
5290
-
5505
+ const auto& blob_files = vstorage->GetBlobFiles();
5506
+ for (const auto& meta : blob_files) {
5291
5507
  assert(meta);
5292
- assert(blob_file_number == meta->GetBlobFileNumber());
5508
+
5509
+ const uint64_t blob_file_number = meta->GetBlobFileNumber();
5293
5510
 
5294
5511
  edit.AddBlobFile(blob_file_number, meta->GetTotalBlobCount(),
5295
5512
  meta->GetTotalBlobBytes(), meta->GetChecksumMethod(),
@@ -5309,7 +5526,7 @@ Status VersionSet::WriteCurrentStateToManifest(
5309
5526
  // min_log_number_to_keep is for the whole db, not for specific column family.
5310
5527
  // So it does not need to be set for every column family, just need to be set once.
5311
5528
  // Since default CF can never be dropped, we set the min_log to the default CF here.
5312
- uint64_t min_log = min_log_number_to_keep_2pc();
5529
+ uint64_t min_log = min_log_number_to_keep();
5313
5530
  if (min_log != 0) {
5314
5531
  edit.SetMinLogNumberToKeep(min_log);
5315
5532
  }
@@ -5319,6 +5536,9 @@ Status VersionSet::WriteCurrentStateToManifest(
5319
5536
  if (!full_history_ts_low.empty()) {
5320
5537
  edit.SetFullHistoryTsLow(full_history_ts_low);
5321
5538
  }
5539
+
5540
+ edit.SetLastSequence(descriptor_last_sequence_);
5541
+
5322
5542
  std::string record;
5323
5543
  if (!edit.EncodeTo(&record)) {
5324
5544
  return Status::Corruption(
@@ -5489,7 +5709,7 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
5489
5709
  if (table_cache != nullptr) {
5490
5710
  result = table_cache->ApproximateOffsetOf(
5491
5711
  key, f.file_metadata->fd, caller, icmp,
5492
- v->GetMutableCFOptions().prefix_extractor.get());
5712
+ v->GetMutableCFOptions().prefix_extractor);
5493
5713
  }
5494
5714
  }
5495
5715
  return result;
@@ -5529,7 +5749,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
5529
5749
  }
5530
5750
  return table_cache->ApproximateSize(
5531
5751
  start, end, f.file_metadata->fd, caller, icmp,
5532
- v->GetMutableCFOptions().prefix_extractor.get());
5752
+ v->GetMutableCFOptions().prefix_extractor);
5533
5753
  }
5534
5754
 
5535
5755
  void VersionSet::AddLiveFiles(std::vector<uint64_t>* live_table_files,
@@ -5603,7 +5823,9 @@ void VersionSet::AddLiveFiles(std::vector<uint64_t>* live_table_files,
5603
5823
  InternalIterator* VersionSet::MakeInputIterator(
5604
5824
  const ReadOptions& read_options, const Compaction* c,
5605
5825
  RangeDelAggregator* range_del_agg,
5606
- const FileOptions& file_options_compactions) {
5826
+ const FileOptions& file_options_compactions,
5827
+ const std::optional<const Slice>& start,
5828
+ const std::optional<const Slice>& end) {
5607
5829
  auto cfd = c->column_family_data();
5608
5830
  // Level-0 files have to be merged together. For other levels,
5609
5831
  // we will make a concatenating iterator per level.
@@ -5618,10 +5840,25 @@ InternalIterator* VersionSet::MakeInputIterator(
5618
5840
  if (c->level(which) == 0) {
5619
5841
  const LevelFilesBrief* flevel = c->input_levels(which);
5620
5842
  for (size_t i = 0; i < flevel->num_files; i++) {
5843
+ const FileMetaData& fmd = *flevel->files[i].file_metadata;
5844
+ if (start.has_value() &&
5845
+ cfd->user_comparator()->Compare(start.value(),
5846
+ fmd.largest.user_key()) > 0) {
5847
+ continue;
5848
+ }
5849
+ // We should be able to filter out the case where the end key
5850
+ // equals to the end boundary, since the end key is exclusive.
5851
+ // We try to be extra safe here.
5852
+ if (end.has_value() &&
5853
+ cfd->user_comparator()->Compare(end.value(),
5854
+ fmd.smallest.user_key()) < 0) {
5855
+ continue;
5856
+ }
5857
+
5621
5858
  list[num++] = cfd->table_cache()->NewIterator(
5622
5859
  read_options, file_options_compactions,
5623
- cfd->internal_comparator(), *flevel->files[i].file_metadata,
5624
- range_del_agg, c->mutable_cf_options()->prefix_extractor.get(),
5860
+ cfd->internal_comparator(), fmd, range_del_agg,
5861
+ c->mutable_cf_options()->prefix_extractor,
5625
5862
  /*table_reader_ptr=*/nullptr,
5626
5863
  /*file_read_hist=*/nullptr, TableReaderCaller::kCompaction,
5627
5864
  /*arena=*/nullptr,
@@ -5637,7 +5874,7 @@ InternalIterator* VersionSet::MakeInputIterator(
5637
5874
  list[num++] = new LevelIterator(
5638
5875
  cfd->table_cache(), read_options, file_options_compactions,
5639
5876
  cfd->internal_comparator(), c->input_levels(which),
5640
- c->mutable_cf_options()->prefix_extractor.get(),
5877
+ c->mutable_cf_options()->prefix_extractor,
5641
5878
  /*should_sample=*/false,
5642
5879
  /*no per level latency histogram=*/nullptr,
5643
5880
  TableReaderCaller::kCompaction, /*skip_filters=*/false,
@@ -5654,57 +5891,6 @@ InternalIterator* VersionSet::MakeInputIterator(
5654
5891
  return result;
5655
5892
  }
5656
5893
 
5657
- // verify that the files listed in this compaction are present
5658
- // in the current version
5659
- bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) {
5660
- #ifndef NDEBUG
5661
- Version* version = c->column_family_data()->current();
5662
- const VersionStorageInfo* vstorage = version->storage_info();
5663
- if (c->input_version() != version) {
5664
- ROCKS_LOG_INFO(
5665
- db_options_->info_log,
5666
- "[%s] compaction output being applied to a different base version from"
5667
- " input version",
5668
- c->column_family_data()->GetName().c_str());
5669
-
5670
- if (vstorage->compaction_style_ == kCompactionStyleLevel &&
5671
- c->start_level() == 0 && c->num_input_levels() > 2U) {
5672
- // We are doing a L0->base_level compaction. The assumption is if
5673
- // base level is not L1, levels from L1 to base_level - 1 is empty.
5674
- // This is ensured by having one compaction from L0 going on at the
5675
- // same time in level-based compaction. So that during the time, no
5676
- // compaction/flush can put files to those levels.
5677
- for (int l = c->start_level() + 1; l < c->output_level(); l++) {
5678
- if (vstorage->NumLevelFiles(l) != 0) {
5679
- return false;
5680
- }
5681
- }
5682
- }
5683
- }
5684
-
5685
- for (size_t input = 0; input < c->num_input_levels(); ++input) {
5686
- int level = c->level(input);
5687
- for (size_t i = 0; i < c->num_input_files(input); ++i) {
5688
- uint64_t number = c->input(input, i)->fd.GetNumber();
5689
- bool found = false;
5690
- for (size_t j = 0; j < vstorage->files_[level].size(); j++) {
5691
- FileMetaData* f = vstorage->files_[level][j];
5692
- if (f->fd.GetNumber() == number) {
5693
- found = true;
5694
- break;
5695
- }
5696
- }
5697
- if (!found) {
5698
- return false; // input files non existent in current version
5699
- }
5700
- }
5701
- }
5702
- #else
5703
- (void)c;
5704
- #endif
5705
- return true; // everything good
5706
- }
5707
-
5708
5894
  Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel,
5709
5895
  FileMetaData** meta,
5710
5896
  ColumnFamilyData** cfd) {
@@ -5745,11 +5931,13 @@ void VersionSet::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
5745
5931
  assert(!cfd->ioptions()->cf_paths.empty());
5746
5932
  filemetadata.db_path = cfd->ioptions()->cf_paths.back().path;
5747
5933
  }
5934
+ filemetadata.directory = filemetadata.db_path;
5748
5935
  const uint64_t file_number = file->fd.GetNumber();
5749
5936
  filemetadata.name = MakeTableFileName("", file_number);
5937
+ filemetadata.relative_filename = filemetadata.name.substr(1);
5750
5938
  filemetadata.file_number = file_number;
5751
5939
  filemetadata.level = level;
5752
- filemetadata.size = static_cast<size_t>(file->fd.GetFileSize());
5940
+ filemetadata.size = file->fd.GetFileSize();
5753
5941
  filemetadata.smallestkey = file->smallest.user_key().ToString();
5754
5942
  filemetadata.largestkey = file->largest.user_key().ToString();
5755
5943
  filemetadata.smallest_seqno = file->fd.smallest_seqno;
@@ -5762,6 +5950,9 @@ void VersionSet::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
5762
5950
  filemetadata.oldest_blob_file_number = file->oldest_blob_file_number;
5763
5951
  filemetadata.file_checksum = file->file_checksum;
5764
5952
  filemetadata.file_checksum_func_name = file->file_checksum_func_name;
5953
+ filemetadata.temperature = file->temperature;
5954
+ filemetadata.oldest_ancester_time = file->TryGetOldestAncesterTime();
5955
+ filemetadata.file_creation_time = file->TryGetFileCreationTime();
5765
5956
  metadata->push_back(filemetadata);
5766
5957
  }
5767
5958
  }
@@ -5820,9 +6011,10 @@ ColumnFamilyData* VersionSet::CreateColumnFamily(
5820
6011
  *new_cfd->GetLatestMutableCFOptions(), io_tracer_,
5821
6012
  current_version_number_++);
5822
6013
 
5823
- // Fill level target base information.
5824
- v->storage_info()->CalculateBaseBytes(*new_cfd->ioptions(),
5825
- *new_cfd->GetLatestMutableCFOptions());
6014
+ constexpr bool update_stats = false;
6015
+
6016
+ v->PrepareAppend(*new_cfd->GetLatestMutableCFOptions(), update_stats);
6017
+
5826
6018
  AppendVersion(new_cfd, v);
5827
6019
  // GetLatestMutableCFOptions() is safe here without mutex since the
5828
6020
  // cfd is not available to client
@@ -5858,6 +6050,34 @@ uint64_t VersionSet::GetTotalSstFilesSize(Version* dummy_versions) {
5858
6050
  return total_files_size;
5859
6051
  }
5860
6052
 
6053
+ uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) {
6054
+ std::unordered_set<uint64_t> unique_blob_files;
6055
+
6056
+ uint64_t all_versions_blob_file_size = 0;
6057
+
6058
+ for (auto* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
6059
+ // iterate all the versions
6060
+ const auto* vstorage = v->storage_info();
6061
+ assert(vstorage);
6062
+
6063
+ const auto& blob_files = vstorage->GetBlobFiles();
6064
+
6065
+ for (const auto& meta : blob_files) {
6066
+ assert(meta);
6067
+
6068
+ const uint64_t blob_file_number = meta->GetBlobFileNumber();
6069
+
6070
+ if (unique_blob_files.find(blob_file_number) == unique_blob_files.end()) {
6071
+ // find Blob file that has not been counted
6072
+ unique_blob_files.insert(blob_file_number);
6073
+ all_versions_blob_file_size += meta->GetBlobFileSize();
6074
+ }
6075
+ }
6076
+ }
6077
+
6078
+ return all_versions_blob_file_size;
6079
+ }
6080
+
5861
6081
  Status VersionSet::VerifyFileMetadata(const std::string& fpath,
5862
6082
  const FileMetaData& meta) const {
5863
6083
  uint64_t fsize = 0;
@@ -5877,8 +6097,8 @@ ReactiveVersionSet::ReactiveVersionSet(
5877
6097
  const std::shared_ptr<IOTracer>& io_tracer)
5878
6098
  : VersionSet(dbname, _db_options, _file_options, table_cache,
5879
6099
  write_buffer_manager, write_controller,
5880
- /*block_cache_tracer=*/nullptr, io_tracer),
5881
- number_of_edits_to_skip_(0) {}
6100
+ /*block_cache_tracer=*/nullptr, io_tracer,
6101
+ /*db_session_id*/ "") {}
5882
6102
 
5883
6103
  ReactiveVersionSet::~ReactiveVersionSet() {}
5884
6104
 
@@ -5891,443 +6111,124 @@ Status ReactiveVersionSet::Recover(
5891
6111
  assert(manifest_reporter != nullptr);
5892
6112
  assert(manifest_reader_status != nullptr);
5893
6113
 
5894
- std::unordered_map<std::string, ColumnFamilyOptions> cf_name_to_options;
5895
- for (const auto& cf : column_families) {
5896
- cf_name_to_options.insert({cf.name, cf.options});
5897
- }
5898
-
5899
- // add default column family
5900
- auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName);
5901
- if (default_cf_iter == cf_name_to_options.end()) {
5902
- return Status::InvalidArgument("Default column family not specified");
5903
- }
5904
- VersionEdit default_cf_edit;
5905
- default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName);
5906
- default_cf_edit.SetColumnFamily(0);
5907
- ColumnFamilyData* default_cfd =
5908
- CreateColumnFamily(default_cf_iter->second, &default_cf_edit);
5909
- // In recovery, nobody else can access it, so it's fine to set it to be
5910
- // initialized earlier.
5911
- default_cfd->set_initialized();
5912
- VersionBuilderMap builders;
5913
- std::unordered_map<int, std::string> column_families_not_found;
5914
- builders.insert(
5915
- std::make_pair(0, std::unique_ptr<BaseReferencedVersionBuilder>(
5916
- new BaseReferencedVersionBuilder(default_cfd))));
5917
-
5918
6114
  manifest_reader_status->reset(new Status());
5919
6115
  manifest_reporter->reset(new LogReporter());
5920
6116
  static_cast_with_check<LogReporter>(manifest_reporter->get())->status =
5921
6117
  manifest_reader_status->get();
5922
6118
  Status s = MaybeSwitchManifest(manifest_reporter->get(), manifest_reader);
5923
- log::Reader* reader = manifest_reader->get();
5924
-
5925
- int retry = 0;
5926
- VersionEdit version_edit;
5927
- while (s.ok() && retry < 1) {
5928
- assert(reader != nullptr);
5929
- s = ReadAndRecover(*reader, &read_buffer_, cf_name_to_options,
5930
- column_families_not_found, builders,
5931
- manifest_reader_status->get(), &version_edit);
5932
- if (s.ok()) {
5933
- bool enough = version_edit.has_next_file_number_ &&
5934
- version_edit.has_log_number_ &&
5935
- version_edit.has_last_sequence_;
5936
- if (enough) {
5937
- for (const auto& cf : column_families) {
5938
- auto cfd = column_family_set_->GetColumnFamily(cf.name);
5939
- if (cfd == nullptr) {
5940
- enough = false;
5941
- break;
5942
- }
5943
- }
5944
- }
5945
- if (enough) {
5946
- for (const auto& cf : column_families) {
5947
- auto cfd = column_family_set_->GetColumnFamily(cf.name);
5948
- assert(cfd != nullptr);
5949
- if (!cfd->IsDropped()) {
5950
- auto builder_iter = builders.find(cfd->GetID());
5951
- assert(builder_iter != builders.end());
5952
- auto builder = builder_iter->second->version_builder();
5953
- assert(builder != nullptr);
5954
- s = builder->LoadTableHandlers(
5955
- cfd->internal_stats(), db_options_->max_file_opening_threads,
5956
- false /* prefetch_index_and_filter_in_cache */,
5957
- true /* is_initial_load */,
5958
- cfd->GetLatestMutableCFOptions()->prefix_extractor.get(),
5959
- MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()));
5960
- if (!s.ok()) {
5961
- enough = false;
5962
- if (s.IsPathNotFound()) {
5963
- s = Status::OK();
5964
- }
5965
- break;
5966
- }
5967
- }
5968
- }
5969
- }
5970
- if (enough) {
5971
- break;
5972
- }
5973
- }
5974
- ++retry;
5975
- }
5976
-
5977
- if (s.ok()) {
5978
- if (!version_edit.has_prev_log_number_) {
5979
- version_edit.prev_log_number_ = 0;
5980
- }
5981
- column_family_set_->UpdateMaxColumnFamily(version_edit.max_column_family_);
5982
-
5983
- MarkMinLogNumberToKeep2PC(version_edit.min_log_number_to_keep_);
5984
- MarkFileNumberUsed(version_edit.prev_log_number_);
5985
- MarkFileNumberUsed(version_edit.log_number_);
5986
-
5987
- for (auto cfd : *column_family_set_) {
5988
- assert(builders.count(cfd->GetID()) > 0);
5989
- auto builder = builders[cfd->GetID()]->version_builder();
5990
- if (!builder->CheckConsistencyForNumLevels()) {
5991
- s = Status::InvalidArgument(
5992
- "db has more levels than options.num_levels");
5993
- break;
5994
- }
5995
- }
6119
+ if (!s.ok()) {
6120
+ return s;
5996
6121
  }
6122
+ log::Reader* reader = manifest_reader->get();
6123
+ assert(reader);
5997
6124
 
5998
- if (s.ok()) {
5999
- for (auto cfd : *column_family_set_) {
6000
- if (cfd->IsDropped()) {
6001
- continue;
6002
- }
6003
- assert(cfd->initialized());
6004
- auto builders_iter = builders.find(cfd->GetID());
6005
- assert(builders_iter != builders.end());
6006
- auto* builder = builders_iter->second->version_builder();
6125
+ manifest_tailer_.reset(new ManifestTailer(
6126
+ column_families, const_cast<ReactiveVersionSet*>(this), io_tracer_));
6007
6127
 
6008
- Version* v = new Version(cfd, this, file_options_,
6009
- *cfd->GetLatestMutableCFOptions(), io_tracer_,
6010
- current_version_number_++);
6011
- s = builder->SaveTo(v->storage_info());
6128
+ manifest_tailer_->Iterate(*reader, manifest_reader_status->get());
6012
6129
 
6013
- if (s.ok()) {
6014
- // Install recovered version
6015
- v->PrepareApply(*cfd->GetLatestMutableCFOptions(),
6016
- !(db_options_->skip_stats_update_on_db_open));
6017
- AppendVersion(cfd, v);
6018
- } else {
6019
- ROCKS_LOG_ERROR(db_options_->info_log,
6020
- "[%s]: inconsistent version: %s\n",
6021
- cfd->GetName().c_str(), s.ToString().c_str());
6022
- delete v;
6023
- break;
6024
- }
6025
- }
6026
- }
6027
- if (s.ok()) {
6028
- next_file_number_.store(version_edit.next_file_number_ + 1);
6029
- last_allocated_sequence_ = version_edit.last_sequence_;
6030
- last_published_sequence_ = version_edit.last_sequence_;
6031
- last_sequence_ = version_edit.last_sequence_;
6032
- prev_log_number_ = version_edit.prev_log_number_;
6033
- for (auto cfd : *column_family_set_) {
6034
- if (cfd->IsDropped()) {
6035
- continue;
6036
- }
6037
- ROCKS_LOG_INFO(db_options_->info_log,
6038
- "Column family [%s] (ID %u), log number is %" PRIu64 "\n",
6039
- cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber());
6040
- }
6041
- }
6042
- return s;
6130
+ return manifest_tailer_->status();
6043
6131
  }
6044
6132
 
6045
6133
  Status ReactiveVersionSet::ReadAndApply(
6046
6134
  InstrumentedMutex* mu,
6047
6135
  std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
6136
+ Status* manifest_read_status,
6048
6137
  std::unordered_set<ColumnFamilyData*>* cfds_changed) {
6049
6138
  assert(manifest_reader != nullptr);
6050
6139
  assert(cfds_changed != nullptr);
6051
6140
  mu->AssertHeld();
6052
6141
 
6053
6142
  Status s;
6054
- uint64_t applied_edits = 0;
6055
- while (s.ok()) {
6056
- Slice record;
6057
- std::string scratch;
6058
- log::Reader* reader = manifest_reader->get();
6059
- std::string old_manifest_path = reader->file()->file_name();
6060
- while (reader->ReadRecord(&record, &scratch)) {
6061
- VersionEdit edit;
6062
- s = edit.DecodeFrom(record);
6063
- if (!s.ok()) {
6064
- break;
6065
- }
6066
-
6067
- // Skip the first VersionEdits of each MANIFEST generated by
6068
- // VersionSet::WriteCurrentStatetoManifest.
6069
- if (number_of_edits_to_skip_ > 0) {
6070
- ColumnFamilyData* cfd =
6071
- column_family_set_->GetColumnFamily(edit.column_family_);
6072
- if (cfd != nullptr && !cfd->IsDropped()) {
6073
- --number_of_edits_to_skip_;
6074
- }
6075
- continue;
6076
- }
6077
-
6078
- s = read_buffer_.AddEdit(&edit);
6079
- if (!s.ok()) {
6080
- break;
6081
- }
6082
- VersionEdit temp_edit;
6083
- if (edit.is_in_atomic_group_) {
6084
- if (read_buffer_.IsFull()) {
6085
- // Apply edits in an atomic group when we have read all edits in the
6086
- // group.
6087
- for (auto& e : read_buffer_.replay_buffer()) {
6088
- s = ApplyOneVersionEditToBuilder(e, cfds_changed, &temp_edit);
6089
- if (!s.ok()) {
6090
- break;
6091
- }
6092
- applied_edits++;
6093
- }
6094
- if (!s.ok()) {
6095
- break;
6096
- }
6097
- read_buffer_.Clear();
6098
- }
6099
- } else {
6100
- // Apply a normal edit immediately.
6101
- s = ApplyOneVersionEditToBuilder(edit, cfds_changed, &temp_edit);
6102
- if (s.ok()) {
6103
- applied_edits++;
6104
- } else {
6105
- break;
6106
- }
6107
- }
6108
- }
6109
- if (!s.ok()) {
6110
- // Clear the buffer if we fail to decode/apply an edit.
6111
- read_buffer_.Clear();
6112
- }
6113
- // It's possible that:
6114
- // 1) s.IsCorruption(), indicating the current MANIFEST is corrupted.
6115
- // Or the version(s) rebuilt from tailing the MANIFEST is inconsistent.
6116
- // 2) we have finished reading the current MANIFEST.
6117
- // 3) we have encountered an IOError reading the current MANIFEST.
6118
- // We need to look for the next MANIFEST and start from there. If we cannot
6119
- // find the next MANIFEST, we should exit the loop.
6120
- Status tmp_s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader);
6121
- reader = manifest_reader->get();
6122
- if (tmp_s.ok()) {
6123
- if (reader->file()->file_name() == old_manifest_path) {
6124
- // Still processing the same MANIFEST, thus no need to continue this
6125
- // loop since no record is available if we have reached here.
6126
- break;
6127
- } else {
6128
- // We have switched to a new MANIFEST whose first records have been
6129
- // generated by VersionSet::WriteCurrentStatetoManifest. Since the
6130
- // secondary instance has already finished recovering upon start, there
6131
- // is no need for the secondary to process these records. Actually, if
6132
- // the secondary were to replay these records, the secondary may end up
6133
- // adding the same SST files AGAIN to each column family, causing
6134
- // consistency checks done by VersionBuilder to fail. Therefore, we
6135
- // record the number of records to skip at the beginning of the new
6136
- // MANIFEST and ignore them.
6137
- number_of_edits_to_skip_ = 0;
6138
- for (auto* cfd : *column_family_set_) {
6139
- if (cfd->IsDropped()) {
6140
- continue;
6141
- }
6142
- // Increase number_of_edits_to_skip by 2 because
6143
- // WriteCurrentStatetoManifest() writes 2 version edits for each
6144
- // column family at the beginning of the newly-generated MANIFEST.
6145
- // TODO(yanqin) remove hard-coded value.
6146
- if (db_options_->write_dbid_to_manifest) {
6147
- number_of_edits_to_skip_ += 3;
6148
- } else {
6149
- number_of_edits_to_skip_ += 2;
6150
- }
6151
- }
6152
- s = tmp_s;
6153
- }
6154
- }
6143
+ log::Reader* reader = manifest_reader->get();
6144
+ assert(reader);
6145
+ s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader);
6146
+ if (!s.ok()) {
6147
+ return s;
6155
6148
  }
6156
-
6149
+ manifest_tailer_->Iterate(*(manifest_reader->get()), manifest_read_status);
6150
+ s = manifest_tailer_->status();
6157
6151
  if (s.ok()) {
6158
- for (auto cfd : *column_family_set_) {
6159
- auto builder_iter = active_version_builders_.find(cfd->GetID());
6160
- if (builder_iter == active_version_builders_.end()) {
6161
- continue;
6162
- }
6163
- auto builder = builder_iter->second->version_builder();
6164
- if (!builder->CheckConsistencyForNumLevels()) {
6165
- s = Status::InvalidArgument(
6166
- "db has more levels than options.num_levels");
6167
- break;
6168
- }
6169
- }
6152
+ *cfds_changed = std::move(manifest_tailer_->GetUpdatedColumnFamilies());
6170
6153
  }
6171
- TEST_SYNC_POINT_CALLBACK("ReactiveVersionSet::ReadAndApply:AppliedEdits",
6172
- &applied_edits);
6154
+
6173
6155
  return s;
6174
6156
  }
6175
6157
 
6176
- Status ReactiveVersionSet::ApplyOneVersionEditToBuilder(
6177
- VersionEdit& edit, std::unordered_set<ColumnFamilyData*>* cfds_changed,
6178
- VersionEdit* version_edit) {
6179
- ColumnFamilyData* cfd =
6180
- column_family_set_->GetColumnFamily(edit.column_family_);
6181
-
6182
- // If we cannot find this column family in our column family set, then it
6183
- // may be a new column family created by the primary after the secondary
6184
- // starts. It is also possible that the secondary instance opens only a subset
6185
- // of column families. Ignore it for now.
6186
- if (nullptr == cfd) {
6187
- return Status::OK();
6188
- }
6189
- if (active_version_builders_.find(edit.column_family_) ==
6190
- active_version_builders_.end() &&
6191
- !cfd->IsDropped()) {
6192
- std::unique_ptr<BaseReferencedVersionBuilder> builder_guard(
6193
- new BaseReferencedVersionBuilder(cfd));
6194
- active_version_builders_.insert(
6195
- std::make_pair(edit.column_family_, std::move(builder_guard)));
6158
+ Status ReactiveVersionSet::MaybeSwitchManifest(
6159
+ log::Reader::Reporter* reporter,
6160
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader) {
6161
+ assert(manifest_reader != nullptr);
6162
+ Status s;
6163
+ std::string manifest_path;
6164
+ s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path,
6165
+ &manifest_file_number_);
6166
+ if (!s.ok()) {
6167
+ return s;
6196
6168
  }
6197
-
6198
- auto builder_iter = active_version_builders_.find(edit.column_family_);
6199
- assert(builder_iter != active_version_builders_.end());
6200
- auto builder = builder_iter->second->version_builder();
6201
- assert(builder != nullptr);
6202
-
6203
- if (edit.is_column_family_add_) {
6204
- // TODO (yanqin) for now the secondary ignores column families created
6205
- // after Open. This also simplifies handling of switching to a new MANIFEST
6206
- // and processing the snapshot of the system at the beginning of the
6169
+ std::unique_ptr<FSSequentialFile> manifest_file;
6170
+ if (manifest_reader->get() != nullptr &&
6171
+ manifest_reader->get()->file()->file_name() == manifest_path) {
6172
+ // CURRENT points to the same MANIFEST as before, no need to switch
6207
6173
  // MANIFEST.
6208
- } else if (edit.is_column_family_drop_) {
6209
- // Drop the column family by setting it to be 'dropped' without destroying
6210
- // the column family handle.
6211
- // TODO (haoyu) figure out how to handle column faimly drop for
6212
- // secondary instance. (Is it possible that the ref count for cfd is 0 but
6213
- // the ref count for its versions is higher than 0?)
6214
- cfd->SetDropped();
6215
- if (cfd->UnrefAndTryDelete()) {
6216
- cfd = nullptr;
6217
- }
6218
- active_version_builders_.erase(builder_iter);
6219
- } else {
6220
- Status s = builder->Apply(&edit);
6221
- if (!s.ok()) {
6222
- return s;
6223
- }
6224
- }
6225
- Status s = ExtractInfoFromVersionEdit(cfd, edit, version_edit);
6226
- if (!s.ok()) {
6227
6174
  return s;
6228
6175
  }
6229
-
6230
- if (cfd != nullptr && !cfd->IsDropped()) {
6231
- s = builder->LoadTableHandlers(
6232
- cfd->internal_stats(), db_options_->max_file_opening_threads,
6233
- false /* prefetch_index_and_filter_in_cache */,
6234
- false /* is_initial_load */,
6235
- cfd->GetLatestMutableCFOptions()->prefix_extractor.get(),
6236
- MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()));
6237
- TEST_SYNC_POINT_CALLBACK(
6238
- "ReactiveVersionSet::ApplyOneVersionEditToBuilder:"
6239
- "AfterLoadTableHandlers",
6240
- &s);
6241
-
6242
- if (s.ok()) {
6243
- auto version = new Version(cfd, this, file_options_,
6244
- *cfd->GetLatestMutableCFOptions(), io_tracer_,
6245
- current_version_number_++);
6246
- s = builder->SaveTo(version->storage_info());
6247
- if (s.ok()) {
6248
- version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true);
6249
- AppendVersion(cfd, version);
6250
- active_version_builders_.erase(builder_iter);
6251
- if (cfds_changed->count(cfd) == 0) {
6252
- cfds_changed->insert(cfd);
6253
- }
6254
- } else {
6255
- delete version;
6256
- }
6257
- } else if (s.IsPathNotFound()) {
6258
- s = Status::OK();
6259
- }
6260
- // Some other error has occurred during LoadTableHandlers.
6176
+ assert(nullptr == manifest_reader->get() ||
6177
+ manifest_reader->get()->file()->file_name() != manifest_path);
6178
+ s = fs_->FileExists(manifest_path, IOOptions(), nullptr);
6179
+ if (s.IsNotFound()) {
6180
+ return Status::TryAgain(
6181
+ "The primary may have switched to a new MANIFEST and deleted the old "
6182
+ "one.");
6183
+ } else if (!s.ok()) {
6184
+ return s;
6261
6185
  }
6262
-
6186
+ TEST_SYNC_POINT(
6187
+ "ReactiveVersionSet::MaybeSwitchManifest:"
6188
+ "AfterGetCurrentManifestPath:0");
6189
+ TEST_SYNC_POINT(
6190
+ "ReactiveVersionSet::MaybeSwitchManifest:"
6191
+ "AfterGetCurrentManifestPath:1");
6192
+ // The primary can also delete the MANIFEST while the secondary is reading
6193
+ // it. This is OK on POSIX. For other file systems, maybe create a hard link
6194
+ // to MANIFEST. The hard link should be cleaned up later by the secondary.
6195
+ s = fs_->NewSequentialFile(manifest_path,
6196
+ fs_->OptimizeForManifestRead(file_options_),
6197
+ &manifest_file, nullptr);
6198
+ std::unique_ptr<SequentialFileReader> manifest_file_reader;
6263
6199
  if (s.ok()) {
6264
- if (version_edit->HasNextFile()) {
6265
- next_file_number_.store(version_edit->next_file_number_ + 1);
6266
- }
6267
- if (version_edit->has_last_sequence_) {
6268
- last_allocated_sequence_ = version_edit->last_sequence_;
6269
- last_published_sequence_ = version_edit->last_sequence_;
6270
- last_sequence_ = version_edit->last_sequence_;
6271
- }
6272
- if (version_edit->has_prev_log_number_) {
6273
- prev_log_number_ = version_edit->prev_log_number_;
6274
- MarkFileNumberUsed(version_edit->prev_log_number_);
6275
- }
6276
- if (version_edit->has_log_number_) {
6277
- MarkFileNumberUsed(version_edit->log_number_);
6278
- }
6279
- column_family_set_->UpdateMaxColumnFamily(version_edit->max_column_family_);
6280
- MarkMinLogNumberToKeep2PC(version_edit->min_log_number_to_keep_);
6200
+ manifest_file_reader.reset(new SequentialFileReader(
6201
+ std::move(manifest_file), manifest_path,
6202
+ db_options_->log_readahead_size, io_tracer_, db_options_->listeners));
6203
+ manifest_reader->reset(new log::FragmentBufferedReader(
6204
+ nullptr, std::move(manifest_file_reader), reporter, true /* checksum */,
6205
+ 0 /* log_number */));
6206
+ ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n",
6207
+ manifest_path.c_str());
6208
+ if (manifest_tailer_) {
6209
+ manifest_tailer_->PrepareToReadNewManifest();
6210
+ }
6211
+ } else if (s.IsPathNotFound()) {
6212
+ // This can happen if the primary switches to a new MANIFEST after the
6213
+ // secondary reads the CURRENT file but before the secondary actually tries
6214
+ // to open the MANIFEST.
6215
+ s = Status::TryAgain(
6216
+ "The primary may have switched to a new MANIFEST and deleted the old "
6217
+ "one.");
6281
6218
  }
6282
6219
  return s;
6283
6220
  }
6284
6221
 
6285
- Status ReactiveVersionSet::MaybeSwitchManifest(
6286
- log::Reader::Reporter* reporter,
6287
- std::unique_ptr<log::FragmentBufferedReader>* manifest_reader) {
6288
- assert(manifest_reader != nullptr);
6289
- Status s;
6290
- do {
6291
- std::string manifest_path;
6292
- s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path,
6293
- &manifest_file_number_);
6294
- std::unique_ptr<FSSequentialFile> manifest_file;
6295
- if (s.ok()) {
6296
- if (nullptr == manifest_reader->get() ||
6297
- manifest_reader->get()->file()->file_name() != manifest_path) {
6298
- TEST_SYNC_POINT(
6299
- "ReactiveVersionSet::MaybeSwitchManifest:"
6300
- "AfterGetCurrentManifestPath:0");
6301
- TEST_SYNC_POINT(
6302
- "ReactiveVersionSet::MaybeSwitchManifest:"
6303
- "AfterGetCurrentManifestPath:1");
6304
- s = fs_->NewSequentialFile(manifest_path,
6305
- env_->OptimizeForManifestRead(file_options_),
6306
- &manifest_file, nullptr);
6307
- } else {
6308
- // No need to switch manifest.
6309
- break;
6310
- }
6311
- }
6312
- std::unique_ptr<SequentialFileReader> manifest_file_reader;
6313
- if (s.ok()) {
6314
- manifest_file_reader.reset(new SequentialFileReader(
6315
- std::move(manifest_file), manifest_path,
6316
- db_options_->log_readahead_size, io_tracer_));
6317
- manifest_reader->reset(new log::FragmentBufferedReader(
6318
- nullptr, std::move(manifest_file_reader), reporter,
6319
- true /* checksum */, 0 /* log_number */));
6320
- ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n",
6321
- manifest_path.c_str());
6322
- // TODO (yanqin) every time we switch to a new MANIFEST, we clear the
6323
- // active_version_builders_ map because we choose to construct the
6324
- // versions from scratch, thanks to the first part of each MANIFEST
6325
- // written by VersionSet::WriteCurrentStatetoManifest. This is not
6326
- // necessary, but we choose this at present for the sake of simplicity.
6327
- active_version_builders_.clear();
6328
- }
6329
- } while (s.IsPathNotFound());
6330
- return s;
6222
+ #ifndef NDEBUG
6223
+ uint64_t ReactiveVersionSet::TEST_read_edits_in_atomic_group() const {
6224
+ assert(manifest_tailer_);
6225
+ return manifest_tailer_->GetReadBuffer().TEST_read_edits_in_atomic_group();
6226
+ }
6227
+ #endif // !NDEBUG
6228
+
6229
+ std::vector<VersionEdit>& ReactiveVersionSet::replay_buffer() {
6230
+ assert(manifest_tailer_);
6231
+ return manifest_tailer_->GetReadBuffer().replay_buffer();
6331
6232
  }
6332
6233
 
6333
6234
  } // namespace ROCKSDB_NAMESPACE