@nxtedition/rocksdb 5.2.21 → 5.2.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (923)
  1. package/binding.cc +510 -967
  2. package/binding.gyp +78 -72
  3. package/chained-batch.js +1 -2
  4. package/deps/rocksdb/build_version.cc +70 -4
  5. package/deps/rocksdb/rocksdb/CMakeLists.txt +281 -149
  6. package/deps/rocksdb/rocksdb/Makefile +459 -469
  7. package/deps/rocksdb/rocksdb/TARGETS +5244 -1500
  8. package/deps/rocksdb/rocksdb/cache/cache.cc +12 -3
  9. package/deps/rocksdb/rocksdb/cache/cache_bench.cc +7 -368
  10. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +924 -0
  11. package/deps/rocksdb/rocksdb/cache/cache_entry_roles.cc +128 -0
  12. package/deps/rocksdb/rocksdb/cache/cache_entry_roles.h +103 -0
  13. package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +183 -0
  14. package/deps/rocksdb/rocksdb/cache/cache_helpers.h +11 -0
  15. package/deps/rocksdb/rocksdb/cache/cache_key.cc +344 -0
  16. package/deps/rocksdb/rocksdb/cache/cache_key.h +132 -0
  17. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +183 -0
  18. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +288 -0
  19. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc +468 -0
  20. package/deps/rocksdb/rocksdb/cache/cache_test.cc +85 -8
  21. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +121 -51
  22. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +171 -0
  23. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +86 -0
  24. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +607 -0
  25. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +381 -154
  26. package/deps/rocksdb/rocksdb/cache/lru_cache.h +176 -33
  27. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +1659 -3
  28. package/deps/rocksdb/rocksdb/cache/sharded_cache.cc +94 -23
  29. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +49 -28
  30. package/deps/rocksdb/rocksdb/cmake/modules/CxxFlags.cmake +7 -0
  31. package/deps/rocksdb/rocksdb/cmake/modules/FindJeMalloc.cmake +29 -0
  32. package/deps/rocksdb/rocksdb/cmake/modules/FindNUMA.cmake +29 -0
  33. package/deps/rocksdb/rocksdb/cmake/modules/FindSnappy.cmake +29 -0
  34. package/deps/rocksdb/rocksdb/cmake/modules/FindTBB.cmake +33 -0
  35. package/deps/rocksdb/rocksdb/cmake/modules/Findgflags.cmake +29 -0
  36. package/deps/rocksdb/rocksdb/cmake/modules/Findlz4.cmake +29 -0
  37. package/deps/rocksdb/rocksdb/cmake/modules/Finduring.cmake +26 -0
  38. package/deps/rocksdb/rocksdb/cmake/modules/Findzstd.cmake +29 -0
  39. package/deps/rocksdb/rocksdb/cmake/modules/ReadVersion.cmake +10 -0
  40. package/deps/rocksdb/rocksdb/crash_test.mk +93 -0
  41. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +54 -31
  42. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +10 -6
  43. package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator.h +146 -0
  44. package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator_test.cc +326 -0
  45. package/deps/rocksdb/rocksdb/db/blob/blob_fetcher.cc +34 -0
  46. package/deps/rocksdb/rocksdb/db/blob/blob_fetcher.h +37 -0
  47. package/deps/rocksdb/rocksdb/db/blob/blob_file_addition.cc +4 -2
  48. package/deps/rocksdb/rocksdb/db/blob/blob_file_addition_test.cc +8 -4
  49. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +99 -40
  50. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.h +20 -8
  51. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +95 -83
  52. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.cc +13 -10
  53. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.h +7 -4
  54. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc +37 -37
  55. package/deps/rocksdb/rocksdb/db/blob/blob_file_completion_callback.h +101 -0
  56. package/deps/rocksdb/rocksdb/db/blob/blob_file_meta.cc +8 -1
  57. package/deps/rocksdb/rocksdb/db/blob/blob_file_meta.h +6 -0
  58. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +209 -44
  59. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +37 -11
  60. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +382 -179
  61. package/deps/rocksdb/rocksdb/db/blob/blob_garbage_meter.cc +100 -0
  62. package/deps/rocksdb/rocksdb/db/blob/blob_garbage_meter.h +102 -0
  63. package/deps/rocksdb/rocksdb/db/blob/blob_garbage_meter_test.cc +196 -0
  64. package/deps/rocksdb/rocksdb/db/blob/blob_index.h +3 -0
  65. package/deps/rocksdb/rocksdb/db/blob/blob_log_format.h +2 -1
  66. package/deps/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc +7 -5
  67. package/deps/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.h +10 -3
  68. package/deps/rocksdb/rocksdb/db/blob/blob_log_writer.cc +12 -8
  69. package/deps/rocksdb/rocksdb/db/blob/blob_log_writer.h +5 -5
  70. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +772 -9
  71. package/deps/rocksdb/rocksdb/db/blob/db_blob_compaction_test.cc +730 -0
  72. package/deps/rocksdb/rocksdb/db/blob/db_blob_corruption_test.cc +82 -0
  73. package/deps/rocksdb/rocksdb/db/blob/db_blob_index_test.cc +155 -17
  74. package/deps/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.cc +21 -0
  75. package/deps/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.h +38 -0
  76. package/deps/rocksdb/rocksdb/db/builder.cc +137 -89
  77. package/deps/rocksdb/rocksdb/db/builder.h +16 -37
  78. package/deps/rocksdb/rocksdb/db/c.cc +413 -208
  79. package/deps/rocksdb/rocksdb/db/c_test.c +227 -138
  80. package/deps/rocksdb/rocksdb/db/column_family.cc +118 -103
  81. package/deps/rocksdb/rocksdb/db/column_family.h +86 -44
  82. package/deps/rocksdb/rocksdb/db/column_family_test.cc +38 -24
  83. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +81 -0
  84. package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +275 -0
  85. package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator_test.cc +258 -0
  86. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +81 -28
  87. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +43 -12
  88. package/deps/rocksdb/rocksdb/db/compaction/compaction_iteration_stats.h +12 -0
  89. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +406 -215
  90. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +147 -50
  91. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +167 -61
  92. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +1321 -156
  93. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +197 -28
  94. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +2 -3
  95. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +246 -43
  96. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +65 -26
  97. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +7 -7
  98. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +122 -9
  99. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +8 -2
  100. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +18 -6
  101. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +1 -1
  102. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +536 -44
  103. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +311 -30
  104. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +1 -1
  105. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +849 -0
  106. package/deps/rocksdb/rocksdb/db/compaction/file_pri.h +92 -0
  107. package/deps/rocksdb/rocksdb/db/compaction/sst_partitioner.cc +46 -0
  108. package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +1 -1
  109. package/deps/rocksdb/rocksdb/db/convenience.cc +6 -3
  110. package/deps/rocksdb/rocksdb/db/corruption_test.cc +383 -28
  111. package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +7 -2
  112. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +154 -45
  113. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +1095 -33
  114. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +1249 -203
  115. package/deps/rocksdb/rocksdb/db/db_compaction_filter_test.cc +135 -9
  116. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +1348 -166
  117. package/deps/rocksdb/rocksdb/db/db_dynamic_level_test.cc +3 -5
  118. package/deps/rocksdb/rocksdb/db/db_encryption_test.cc +1 -1
  119. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +312 -45
  120. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +1734 -48
  121. package/deps/rocksdb/rocksdb/db/{compacted_db_impl.cc → db_impl/compacted_db_impl.cc} +24 -7
  122. package/deps/rocksdb/rocksdb/db/{compacted_db_impl.h → db_impl/compacted_db_impl.h} +1 -1
  123. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +644 -333
  124. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +365 -92
  125. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +578 -210
  126. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +38 -16
  127. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +17 -10
  128. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +75 -74
  129. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +450 -183
  130. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +42 -9
  131. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +232 -15
  132. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +42 -4
  133. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +297 -100
  134. package/deps/rocksdb/rocksdb/db/db_info_dumper.cc +16 -15
  135. package/deps/rocksdb/rocksdb/db/db_inplace_update_test.cc +31 -1
  136. package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +6 -5
  137. package/deps/rocksdb/rocksdb/db/db_iter.cc +218 -153
  138. package/deps/rocksdb/rocksdb/db/db_iter.h +14 -12
  139. package/deps/rocksdb/rocksdb/db/db_iter_stress_test.cc +1 -1
  140. package/deps/rocksdb/rocksdb/db/db_iter_test.cc +84 -160
  141. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +47 -6
  142. package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +204 -0
  143. package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +21 -13
  144. package/deps/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc +17 -10
  145. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +38 -24
  146. package/deps/rocksdb/rocksdb/db/db_merge_operand_test.cc +184 -19
  147. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +1 -1
  148. package/deps/rocksdb/rocksdb/db/db_options_test.cc +183 -3
  149. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +409 -9
  150. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +92 -23
  151. package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +446 -0
  152. package/deps/rocksdb/rocksdb/db/{db_impl/db_secondary_test.cc → db_secondary_test.cc} +363 -35
  153. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +520 -15
  154. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +50 -1
  155. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +139 -4
  156. package/deps/rocksdb/rocksdb/db/db_tailing_iter_test.cc +1 -1
  157. package/deps/rocksdb/rocksdb/db/db_test.cc +669 -359
  158. package/deps/rocksdb/rocksdb/db/db_test2.cc +2110 -304
  159. package/deps/rocksdb/rocksdb/db/db_test_util.cc +76 -43
  160. package/deps/rocksdb/rocksdb/db/db_test_util.h +231 -103
  161. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +19 -11
  162. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +490 -71
  163. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +980 -349
  164. package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +11 -12
  165. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +793 -0
  166. package/deps/rocksdb/rocksdb/db/db_write_test.cc +2 -1
  167. package/deps/rocksdb/rocksdb/db/dbformat.cc +4 -12
  168. package/deps/rocksdb/rocksdb/db/dbformat.h +28 -18
  169. package/deps/rocksdb/rocksdb/db/dbformat_test.cc +3 -0
  170. package/deps/rocksdb/rocksdb/db/deletefile_test.cc +50 -15
  171. package/deps/rocksdb/rocksdb/db/error_handler.cc +127 -41
  172. package/deps/rocksdb/rocksdb/db/error_handler.h +12 -5
  173. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +524 -255
  174. package/deps/rocksdb/rocksdb/db/event_helpers.cc +136 -11
  175. package/deps/rocksdb/rocksdb/db/event_helpers.h +27 -2
  176. package/deps/rocksdb/rocksdb/db/experimental.cc +100 -0
  177. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +307 -4
  178. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +137 -60
  179. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +12 -8
  180. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +86 -55
  181. package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +86 -5
  182. package/deps/rocksdb/rocksdb/db/filename_test.cc +63 -0
  183. package/deps/rocksdb/rocksdb/db/flush_job.cc +619 -64
  184. package/deps/rocksdb/rocksdb/db/flush_job.h +30 -7
  185. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +33 -16
  186. package/deps/rocksdb/rocksdb/db/flush_scheduler.h +2 -1
  187. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +18 -17
  188. package/deps/rocksdb/rocksdb/db/forward_iterator.h +5 -4
  189. package/deps/rocksdb/rocksdb/db/forward_iterator_bench.cc +0 -1
  190. package/deps/rocksdb/rocksdb/db/history_trimming_iterator.h +91 -0
  191. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +25 -14
  192. package/deps/rocksdb/rocksdb/db/import_column_family_job.h +6 -5
  193. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +1 -1
  194. package/deps/rocksdb/rocksdb/db/internal_stats.cc +471 -50
  195. package/deps/rocksdb/rocksdb/db/internal_stats.h +129 -25
  196. package/deps/rocksdb/rocksdb/db/job_context.h +22 -9
  197. package/deps/rocksdb/rocksdb/db/kv_checksum.h +394 -0
  198. package/deps/rocksdb/rocksdb/db/listener_test.cc +518 -41
  199. package/deps/rocksdb/rocksdb/db/log_format.h +4 -1
  200. package/deps/rocksdb/rocksdb/db/log_reader.cc +129 -6
  201. package/deps/rocksdb/rocksdb/db/log_reader.h +17 -1
  202. package/deps/rocksdb/rocksdb/db/log_test.cc +161 -11
  203. package/deps/rocksdb/rocksdb/db/log_writer.cc +92 -13
  204. package/deps/rocksdb/rocksdb/db/log_writer.h +18 -5
  205. package/deps/rocksdb/rocksdb/db/logs_with_prep_tracker.h +1 -1
  206. package/deps/rocksdb/rocksdb/db/lookup_key.h +0 -1
  207. package/deps/rocksdb/rocksdb/db/malloc_stats.cc +2 -2
  208. package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +21 -8
  209. package/deps/rocksdb/rocksdb/db/memtable.cc +144 -54
  210. package/deps/rocksdb/rocksdb/db/memtable.h +72 -15
  211. package/deps/rocksdb/rocksdb/db/memtable_list.cc +95 -47
  212. package/deps/rocksdb/rocksdb/db/memtable_list.h +33 -13
  213. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +61 -31
  214. package/deps/rocksdb/rocksdb/db/merge_context.h +20 -8
  215. package/deps/rocksdb/rocksdb/db/merge_helper.cc +54 -11
  216. package/deps/rocksdb/rocksdb/db/merge_helper.h +17 -6
  217. package/deps/rocksdb/rocksdb/db/merge_helper_test.cc +13 -7
  218. package/deps/rocksdb/rocksdb/db/merge_test.cc +40 -19
  219. package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +14 -25
  220. package/deps/rocksdb/rocksdb/db/output_validator.cc +3 -0
  221. package/deps/rocksdb/rocksdb/db/output_validator.h +5 -4
  222. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +32 -28
  223. package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.cc +43 -29
  224. package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.h +9 -7
  225. package/deps/rocksdb/rocksdb/db/periodic_work_scheduler_test.cc +21 -16
  226. package/deps/rocksdb/rocksdb/db/pinned_iterators_manager.h +1 -1
  227. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +29 -36
  228. package/deps/rocksdb/rocksdb/db/pre_release_callback.h +1 -2
  229. package/deps/rocksdb/rocksdb/db/prefix_test.cc +4 -4
  230. package/deps/rocksdb/rocksdb/db/range_del_aggregator.h +2 -2
  231. package/deps/rocksdb/rocksdb/db/range_del_aggregator_bench.cc +11 -11
  232. package/deps/rocksdb/rocksdb/db/range_del_aggregator_test.cc +3 -2
  233. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.cc +14 -8
  234. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +17 -0
  235. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter_test.cc +4 -2
  236. package/deps/rocksdb/rocksdb/db/read_callback.h +1 -0
  237. package/deps/rocksdb/rocksdb/db/repair.cc +87 -58
  238. package/deps/rocksdb/rocksdb/db/repair_test.cc +35 -5
  239. package/deps/rocksdb/rocksdb/db/snapshot_impl.h +2 -1
  240. package/deps/rocksdb/rocksdb/db/table_cache.cc +95 -69
  241. package/deps/rocksdb/rocksdb/db/table_cache.h +63 -53
  242. package/deps/rocksdb/rocksdb/db/table_properties_collector.cc +4 -4
  243. package/deps/rocksdb/rocksdb/db/table_properties_collector.h +78 -10
  244. package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +28 -33
  245. package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +30 -51
  246. package/deps/rocksdb/rocksdb/db/transaction_log_impl.h +12 -8
  247. package/deps/rocksdb/rocksdb/db/version_builder.cc +564 -341
  248. package/deps/rocksdb/rocksdb/db/version_builder.h +8 -8
  249. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +327 -155
  250. package/deps/rocksdb/rocksdb/db/version_edit.cc +89 -27
  251. package/deps/rocksdb/rocksdb/db/version_edit.h +42 -17
  252. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +324 -43
  253. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +79 -22
  254. package/deps/rocksdb/rocksdb/db/version_edit_test.cc +165 -20
  255. package/deps/rocksdb/rocksdb/db/version_set.cc +935 -1034
  256. package/deps/rocksdb/rocksdb/db/version_set.h +183 -122
  257. package/deps/rocksdb/rocksdb/db/version_set_test.cc +556 -138
  258. package/deps/rocksdb/rocksdb/db/version_util.h +68 -0
  259. package/deps/rocksdb/rocksdb/db/wal_manager.cc +23 -21
  260. package/deps/rocksdb/rocksdb/db/wal_manager.h +5 -2
  261. package/deps/rocksdb/rocksdb/db/wal_manager_test.cc +30 -27
  262. package/deps/rocksdb/rocksdb/db/write_batch.cc +704 -209
  263. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +135 -2
  264. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +209 -5
  265. package/deps/rocksdb/rocksdb/db/write_callback_test.cc +2 -0
  266. package/deps/rocksdb/rocksdb/db/write_controller.cc +47 -54
  267. package/deps/rocksdb/rocksdb/db/write_controller.h +12 -9
  268. package/deps/rocksdb/rocksdb/db/write_controller_test.cc +215 -103
  269. package/deps/rocksdb/rocksdb/db/write_thread.cc +11 -0
  270. package/deps/rocksdb/rocksdb/db/write_thread.h +14 -8
  271. package/deps/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt +7 -4
  272. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +10 -3
  273. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +6 -0
  274. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress.cc +1 -1
  275. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +19 -2
  276. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +78 -25
  277. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_filter.h +13 -2
  278. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +29 -12
  279. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +5 -1
  280. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +199 -32
  281. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.cc +188 -0
  282. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +59 -10
  283. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +77 -109
  284. package/deps/rocksdb/rocksdb/{third-party/folly/folly/synchronization/WaitOptions.cpp → db_stress_tool/db_stress_stat.cc} +9 -4
  285. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_stat.h +7 -6
  286. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_table_properties_collector.h +1 -0
  287. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +699 -143
  288. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +20 -2
  289. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +49 -39
  290. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +631 -0
  291. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.h +287 -0
  292. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +1565 -0
  293. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +374 -0
  294. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +149 -18
  295. package/deps/rocksdb/rocksdb/env/composite_env.cc +464 -0
  296. package/deps/rocksdb/rocksdb/env/composite_env_wrapper.h +98 -646
  297. package/deps/rocksdb/rocksdb/env/emulated_clock.h +114 -0
  298. package/deps/rocksdb/rocksdb/env/env.cc +632 -42
  299. package/deps/rocksdb/rocksdb/env/env_basic_test.cc +84 -36
  300. package/deps/rocksdb/rocksdb/env/env_chroot.cc +88 -286
  301. package/deps/rocksdb/rocksdb/env/env_chroot.h +34 -1
  302. package/deps/rocksdb/rocksdb/env/env_encryption.cc +469 -277
  303. package/deps/rocksdb/rocksdb/env/env_encryption_ctr.h +9 -30
  304. package/deps/rocksdb/rocksdb/env/env_posix.cc +110 -119
  305. package/deps/rocksdb/rocksdb/env/env_test.cc +1128 -39
  306. package/deps/rocksdb/rocksdb/env/file_system.cc +147 -8
  307. package/deps/rocksdb/rocksdb/env/file_system_tracer.cc +207 -136
  308. package/deps/rocksdb/rocksdb/env/file_system_tracer.h +86 -54
  309. package/deps/rocksdb/rocksdb/env/fs_posix.cc +192 -64
  310. package/deps/rocksdb/rocksdb/env/fs_readonly.h +107 -0
  311. package/deps/rocksdb/rocksdb/env/fs_remap.cc +339 -0
  312. package/deps/rocksdb/rocksdb/env/fs_remap.h +139 -0
  313. package/deps/rocksdb/rocksdb/env/io_posix.cc +245 -41
  314. package/deps/rocksdb/rocksdb/env/io_posix.h +66 -1
  315. package/deps/rocksdb/rocksdb/env/mock_env.cc +147 -149
  316. package/deps/rocksdb/rocksdb/env/mock_env.h +113 -11
  317. package/deps/rocksdb/rocksdb/env/mock_env_test.cc +2 -4
  318. package/deps/rocksdb/rocksdb/env/unique_id_gen.cc +164 -0
  319. package/deps/rocksdb/rocksdb/env/unique_id_gen.h +71 -0
  320. package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +9 -5
  321. package/deps/rocksdb/rocksdb/file/delete_scheduler.h +6 -4
  322. package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +19 -12
  323. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +459 -70
  324. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +205 -28
  325. package/deps/rocksdb/rocksdb/file/file_util.cc +39 -28
  326. package/deps/rocksdb/rocksdb/file/file_util.h +18 -27
  327. package/deps/rocksdb/rocksdb/file/filename.cc +59 -22
  328. package/deps/rocksdb/rocksdb/file/filename.h +13 -8
  329. package/deps/rocksdb/rocksdb/file/line_file_reader.cc +68 -0
  330. package/deps/rocksdb/rocksdb/file/line_file_reader.h +59 -0
  331. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +1130 -6
  332. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +220 -36
  333. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +69 -17
  334. package/deps/rocksdb/rocksdb/file/random_access_file_reader_test.cc +13 -12
  335. package/deps/rocksdb/rocksdb/file/read_write_util.cc +3 -38
  336. package/deps/rocksdb/rocksdb/file/read_write_util.h +0 -4
  337. package/deps/rocksdb/rocksdb/file/readahead_file_info.h +33 -0
  338. package/deps/rocksdb/rocksdb/file/sequence_file_reader.cc +57 -9
  339. package/deps/rocksdb/rocksdb/file/sequence_file_reader.h +58 -6
  340. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.cc +29 -54
  341. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.h +22 -29
  342. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +424 -50
  343. package/deps/rocksdb/rocksdb/file/writable_file_writer.h +66 -19
  344. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +157 -66
  345. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +224 -121
  346. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +333 -30
  347. package/deps/rocksdb/rocksdb/include/rocksdb/cache_bench_tool.h +14 -0
  348. package/deps/rocksdb/rocksdb/include/rocksdb/cleanable.h +1 -1
  349. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_filter.h +90 -50
  350. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +13 -5
  351. package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +20 -4
  352. package/deps/rocksdb/rocksdb/include/rocksdb/concurrent_task_limiter.h +8 -3
  353. package/deps/rocksdb/rocksdb/include/rocksdb/configurable.h +53 -12
  354. package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +31 -6
  355. package/deps/rocksdb/rocksdb/include/rocksdb/customizable.h +102 -7
  356. package/deps/rocksdb/rocksdb/include/rocksdb/data_structure.h +51 -0
  357. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +370 -262
  358. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +286 -87
  359. package/deps/rocksdb/rocksdb/include/rocksdb/env_encryption.h +124 -64
  360. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +27 -0
  361. package/deps/rocksdb/rocksdb/include/rocksdb/file_checksum.h +21 -4
  362. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +384 -41
  363. package/deps/rocksdb/rocksdb/include/rocksdb/filter_policy.h +111 -143
  364. package/deps/rocksdb/rocksdb/include/rocksdb/flush_block_policy.h +20 -6
  365. package/deps/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h +56 -0
  366. package/deps/rocksdb/rocksdb/include/rocksdb/io_status.h +15 -33
  367. package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +37 -1
  368. package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +1 -3
  369. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +314 -26
  370. package/deps/rocksdb/rocksdb/include/rocksdb/memory_allocator.h +11 -7
  371. package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +50 -15
  372. package/deps/rocksdb/rocksdb/include/rocksdb/merge_operator.h +10 -3
  373. package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +186 -96
  374. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +373 -103
  375. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +13 -3
  376. package/deps/rocksdb/rocksdb/include/rocksdb/persistent_cache.h +2 -2
  377. package/deps/rocksdb/rocksdb/include/rocksdb/rate_limiter.h +37 -7
  378. package/deps/rocksdb/rocksdb/include/rocksdb/rocksdb_namespace.h +6 -0
  379. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +87 -0
  380. package/deps/rocksdb/rocksdb/include/rocksdb/slice.h +5 -12
  381. package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +59 -30
  382. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_manager.h +11 -11
  383. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +22 -0
  384. package/deps/rocksdb/rocksdb/include/rocksdb/sst_partitioner.h +17 -10
  385. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +121 -41
  386. package/deps/rocksdb/rocksdb/include/rocksdb/stats_history.h +1 -0
  387. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +114 -136
  388. package/deps/rocksdb/rocksdb/include/rocksdb/system_clock.h +116 -0
  389. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +160 -18
  390. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +57 -15
  391. package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +3 -1
  392. package/deps/rocksdb/rocksdb/include/rocksdb/trace_reader_writer.h +10 -6
  393. package/deps/rocksdb/rocksdb/include/rocksdb/trace_record.h +247 -0
  394. package/deps/rocksdb/rocksdb/include/rocksdb/trace_record_result.h +187 -0
  395. package/deps/rocksdb/rocksdb/include/rocksdb/transaction_log.h +1 -1
  396. package/deps/rocksdb/rocksdb/include/rocksdb/types.h +14 -24
  397. package/deps/rocksdb/rocksdb/include/rocksdb/unique_id.h +46 -0
  398. package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +14 -4
  399. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/agg_merge.h +138 -0
  400. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h +631 -0
  401. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/cache_dump_load.h +142 -0
  402. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/checkpoint.h +12 -9
  403. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/customizable_util.h +368 -0
  404. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +24 -0
  405. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h +4 -0
  406. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/object_registry.h +418 -63
  407. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h +143 -73
  408. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h +2 -2
  409. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/replayer.h +87 -0
  410. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/sim_cache.h +2 -2
  411. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +43 -5
  412. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +18 -23
  413. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +26 -0
  414. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +32 -6
  415. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h +1 -2
  416. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +20 -1
  417. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +30 -3
  418. package/deps/rocksdb/rocksdb/include/rocksdb/wal_filter.h +11 -2
  419. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +89 -11
  420. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch_base.h +11 -0
  421. package/deps/rocksdb/rocksdb/include/rocksdb/write_buffer_manager.h +108 -38
  422. package/deps/rocksdb/rocksdb/logging/auto_roll_logger.cc +40 -23
  423. package/deps/rocksdb/rocksdb/logging/auto_roll_logger.h +12 -5
  424. package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +100 -49
  425. package/deps/rocksdb/rocksdb/logging/env_logger.h +7 -5
  426. package/deps/rocksdb/rocksdb/logging/env_logger_test.cc +0 -1
  427. package/deps/rocksdb/rocksdb/logging/posix_logger.h +3 -9
  428. package/deps/rocksdb/rocksdb/memory/arena.cc +3 -1
  429. package/deps/rocksdb/rocksdb/memory/arena.h +1 -1
  430. package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc +171 -106
  431. package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.h +31 -15
  432. package/deps/rocksdb/rocksdb/memory/memkind_kmem_allocator.cc +15 -4
  433. package/deps/rocksdb/rocksdb/memory/memkind_kmem_allocator.h +24 -8
  434. package/deps/rocksdb/rocksdb/memory/memory_allocator.cc +91 -0
  435. package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +239 -0
  436. package/deps/rocksdb/rocksdb/memory/memory_usage.h +14 -1
  437. package/deps/rocksdb/rocksdb/memtable/hash_linklist_rep.cc +72 -9
  438. package/deps/rocksdb/rocksdb/memtable/hash_skiplist_rep.cc +52 -6
  439. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +53 -0
  440. package/deps/rocksdb/rocksdb/memtable/inlineskiplist_test.cc +5 -5
  441. package/deps/rocksdb/rocksdb/memtable/memtablerep_bench.cc +17 -5
  442. package/deps/rocksdb/rocksdb/memtable/skiplist_test.cc +1 -1
  443. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +87 -0
  444. package/deps/rocksdb/rocksdb/memtable/vectorrep.cc +20 -10
  445. package/deps/rocksdb/rocksdb/memtable/write_buffer_manager.cc +148 -94
  446. package/deps/rocksdb/rocksdb/memtable/write_buffer_manager_test.cc +160 -62
  447. package/deps/rocksdb/rocksdb/microbench/CMakeLists.txt +17 -0
  448. package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +1360 -0
  449. package/deps/rocksdb/rocksdb/microbench/ribbon_bench.cc +153 -0
  450. package/deps/rocksdb/rocksdb/monitoring/histogram.cc +8 -15
  451. package/deps/rocksdb/rocksdb/monitoring/histogram.h +0 -1
  452. package/deps/rocksdb/rocksdb/monitoring/histogram_test.cc +18 -16
  453. package/deps/rocksdb/rocksdb/monitoring/histogram_windowing.cc +9 -7
  454. package/deps/rocksdb/rocksdb/monitoring/histogram_windowing.h +5 -3
  455. package/deps/rocksdb/rocksdb/monitoring/instrumented_mutex.cc +7 -5
  456. package/deps/rocksdb/rocksdb/monitoring/instrumented_mutex.h +37 -12
  457. package/deps/rocksdb/rocksdb/monitoring/iostats_context.cc +26 -6
  458. package/deps/rocksdb/rocksdb/monitoring/iostats_context_imp.h +6 -10
  459. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +14 -13
  460. package/deps/rocksdb/rocksdb/monitoring/perf_context_imp.h +19 -20
  461. package/deps/rocksdb/rocksdb/monitoring/perf_step_timer.h +18 -18
  462. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +84 -2
  463. package/deps/rocksdb/rocksdb/monitoring/statistics.h +6 -0
  464. package/deps/rocksdb/rocksdb/monitoring/statistics_test.cc +47 -2
  465. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +67 -54
  466. package/deps/rocksdb/rocksdb/monitoring/thread_status_updater.cc +4 -1
  467. package/deps/rocksdb/rocksdb/monitoring/thread_status_util.cc +2 -1
  468. package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +2 -2
  469. package/deps/rocksdb/rocksdb/options/cf_options.cc +280 -212
  470. package/deps/rocksdb/rocksdb/options/cf_options.h +51 -57
  471. package/deps/rocksdb/rocksdb/options/configurable.cc +242 -138
  472. package/deps/rocksdb/rocksdb/options/configurable_helper.h +4 -68
  473. package/deps/rocksdb/rocksdb/options/configurable_test.cc +144 -21
  474. package/deps/rocksdb/rocksdb/options/configurable_test.h +2 -3
  475. package/deps/rocksdb/rocksdb/options/customizable.cc +67 -7
  476. package/deps/rocksdb/rocksdb/options/customizable_test.cc +1773 -151
  477. package/deps/rocksdb/rocksdb/options/db_options.cc +275 -47
  478. package/deps/rocksdb/rocksdb/options/db_options.h +36 -7
  479. package/deps/rocksdb/rocksdb/options/options.cc +49 -17
  480. package/deps/rocksdb/rocksdb/options/options_helper.cc +369 -352
  481. package/deps/rocksdb/rocksdb/options/options_helper.h +23 -23
  482. package/deps/rocksdb/rocksdb/options/options_parser.cc +18 -13
  483. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +67 -54
  484. package/deps/rocksdb/rocksdb/options/options_test.cc +1162 -187
  485. package/deps/rocksdb/rocksdb/port/jemalloc_helper.h +1 -1
  486. package/deps/rocksdb/rocksdb/port/lang.h +52 -0
  487. package/deps/rocksdb/rocksdb/port/port_example.h +1 -1
  488. package/deps/rocksdb/rocksdb/port/port_posix.cc +31 -2
  489. package/deps/rocksdb/rocksdb/port/port_posix.h +20 -2
  490. package/deps/rocksdb/rocksdb/port/stack_trace.cc +20 -4
  491. package/deps/rocksdb/rocksdb/port/sys_time.h +2 -2
  492. package/deps/rocksdb/rocksdb/port/win/env_default.cc +7 -7
  493. package/deps/rocksdb/rocksdb/port/win/env_win.cc +44 -74
  494. package/deps/rocksdb/rocksdb/port/win/env_win.h +25 -23
  495. package/deps/rocksdb/rocksdb/port/win/io_win.cc +32 -34
  496. package/deps/rocksdb/rocksdb/port/win/io_win.h +12 -6
  497. package/deps/rocksdb/rocksdb/port/win/port_win.cc +55 -35
  498. package/deps/rocksdb/rocksdb/port/win/port_win.h +22 -5
  499. package/deps/rocksdb/rocksdb/port/win/win_logger.cc +3 -3
  500. package/deps/rocksdb/rocksdb/port/win/win_logger.h +3 -5
  501. package/deps/rocksdb/rocksdb/port/win/win_thread.cc +7 -1
  502. package/deps/rocksdb/rocksdb/port/win/win_thread.h +12 -17
  503. package/deps/rocksdb/rocksdb/python.mk +9 -0
  504. package/deps/rocksdb/rocksdb/src.mk +82 -34
  505. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc +3 -4
  506. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h +1 -1
  507. package/deps/rocksdb/rocksdb/table/block_based/block.cc +158 -80
  508. package/deps/rocksdb/rocksdb/table/block_based/block.h +64 -36
  509. package/deps/rocksdb/rocksdb/table/block_based/block_based_filter_block.cc +23 -14
  510. package/deps/rocksdb/rocksdb/table/block_based/block_based_filter_block.h +13 -5
  511. package/deps/rocksdb/rocksdb/table/block_based/block_based_filter_block_test.cc +3 -218
  512. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +603 -328
  513. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +28 -22
  514. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +220 -82
  515. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +8 -2
  516. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +3 -4
  517. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +28 -4
  518. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +598 -492
  519. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +151 -96
  520. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +31 -58
  521. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +330 -92
  522. package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +50 -19
  523. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +23 -0
  524. package/deps/rocksdb/rocksdb/table/block_based/block_like_traits.h +226 -0
  525. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +56 -22
  526. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.h +42 -4
  527. package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +5 -2
  528. package/deps/rocksdb/rocksdb/table/block_based/block_type.h +2 -0
  529. package/deps/rocksdb/rocksdb/table/block_based/cachable_entry.h +34 -20
  530. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +9 -10
  531. package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +26 -3
  532. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +2 -1
  533. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +844 -202
  534. package/deps/rocksdb/rocksdb/table/block_based/filter_policy_internal.h +281 -81
  535. package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy.cc +62 -2
  536. package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy.h +2 -3
  537. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.cc +28 -7
  538. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.h +22 -6
  539. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc +28 -26
  540. package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +1 -1
  541. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +1 -2
  542. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +2 -1
  543. package/deps/rocksdb/rocksdb/table/block_based/mock_block_based_table.h +11 -4
  544. package/deps/rocksdb/rocksdb/table/block_based/parsed_full_filter_block.cc +2 -1
  545. package/deps/rocksdb/rocksdb/table/block_based/parsed_full_filter_block.h +2 -0
  546. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +68 -26
  547. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +44 -9
  548. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +12 -10
  549. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +3 -4
  550. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.h +23 -4
  551. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +44 -19
  552. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.h +5 -1
  553. package/deps/rocksdb/rocksdb/table/block_based/reader_common.cc +16 -28
  554. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +7 -4
  555. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +2 -2
  556. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +77 -57
  557. package/deps/rocksdb/rocksdb/table/block_fetcher.h +23 -12
  558. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +43 -56
  559. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.cc +8 -8
  560. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.h +2 -1
  561. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc +52 -70
  562. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.cc +5 -8
  563. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h +1 -1
  564. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc +17 -11
  565. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.h +2 -3
  566. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc +42 -51
  567. package/deps/rocksdb/rocksdb/table/format.cc +258 -104
  568. package/deps/rocksdb/rocksdb/table/format.h +120 -109
  569. package/deps/rocksdb/rocksdb/table/get_context.cc +97 -65
  570. package/deps/rocksdb/rocksdb/table/get_context.h +19 -12
  571. package/deps/rocksdb/rocksdb/table/internal_iterator.h +14 -0
  572. package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +8 -0
  573. package/deps/rocksdb/rocksdb/table/merger_test.cc +3 -2
  574. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +11 -21
  575. package/deps/rocksdb/rocksdb/table/merging_iterator.h +3 -3
  576. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +176 -171
  577. package/deps/rocksdb/rocksdb/table/meta_blocks.h +47 -33
  578. package/deps/rocksdb/rocksdb/table/mock_table.cc +7 -9
  579. package/deps/rocksdb/rocksdb/table/mock_table.h +3 -2
  580. package/deps/rocksdb/rocksdb/table/multiget_context.h +15 -8
  581. package/deps/rocksdb/rocksdb/table/persistent_cache_helper.cc +22 -29
  582. package/deps/rocksdb/rocksdb/table/persistent_cache_options.h +6 -3
  583. package/deps/rocksdb/rocksdb/table/plain/plain_table_bloom.h +5 -8
  584. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +29 -26
  585. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.h +12 -16
  586. package/deps/rocksdb/rocksdb/table/plain/plain_table_factory.cc +145 -69
  587. package/deps/rocksdb/rocksdb/table/plain/plain_table_factory.h +1 -1
  588. package/deps/rocksdb/rocksdb/table/plain/plain_table_index.cc +7 -6
  589. package/deps/rocksdb/rocksdb/table/plain/plain_table_index.h +3 -4
  590. package/deps/rocksdb/rocksdb/table/plain/plain_table_key_coding.cc +3 -1
  591. package/deps/rocksdb/rocksdb/table/plain/plain_table_key_coding.h +1 -1
  592. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +13 -18
  593. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.h +4 -9
  594. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +55 -37
  595. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +10 -5
  596. package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +11 -8
  597. package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +222 -16
  598. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +106 -58
  599. package/deps/rocksdb/rocksdb/table/sst_file_writer_collectors.h +6 -5
  600. package/deps/rocksdb/rocksdb/table/table_builder.h +68 -44
  601. package/deps/rocksdb/rocksdb/table/table_factory.cc +37 -10
  602. package/deps/rocksdb/rocksdb/table/table_properties.cc +109 -54
  603. package/deps/rocksdb/rocksdb/table/table_properties_internal.h +4 -20
  604. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +33 -32
  605. package/deps/rocksdb/rocksdb/table/table_reader_caller.h +2 -0
  606. package/deps/rocksdb/rocksdb/table/table_test.cc +989 -326
  607. package/deps/rocksdb/rocksdb/table/two_level_iterator.cc +4 -0
  608. package/deps/rocksdb/rocksdb/table/unique_id.cc +166 -0
  609. package/deps/rocksdb/rocksdb/table/unique_id_impl.h +59 -0
  610. package/deps/rocksdb/rocksdb/test_util/mock_time_env.cc +1 -1
  611. package/deps/rocksdb/rocksdb/test_util/mock_time_env.h +13 -10
  612. package/deps/rocksdb/rocksdb/test_util/sync_point.cc +1 -2
  613. package/deps/rocksdb/rocksdb/test_util/sync_point.h +35 -16
  614. package/deps/rocksdb/rocksdb/test_util/sync_point_impl.cc +32 -10
  615. package/deps/rocksdb/rocksdb/test_util/sync_point_impl.h +31 -4
  616. package/deps/rocksdb/rocksdb/test_util/testharness.cc +53 -1
  617. package/deps/rocksdb/rocksdb/test_util/testharness.h +67 -3
  618. package/deps/rocksdb/rocksdb/test_util/testutil.cc +236 -66
  619. package/deps/rocksdb/rocksdb/test_util/testutil.h +63 -100
  620. package/deps/rocksdb/rocksdb/test_util/transaction_test_util.cc +12 -1
  621. package/deps/rocksdb/rocksdb/tools/blob_dump.cc +2 -2
  622. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.cc +6 -3
  623. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.h +1 -0
  624. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +9 -3
  625. package/deps/rocksdb/rocksdb/tools/db_bench.cc +1 -1
  626. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +1420 -611
  627. package/deps/rocksdb/rocksdb/tools/db_bench_tool_test.cc +11 -8
  628. package/deps/rocksdb/rocksdb/tools/db_repl_stress.cc +11 -1
  629. package/deps/rocksdb/rocksdb/tools/io_tracer_parser_test.cc +4 -2
  630. package/deps/rocksdb/rocksdb/tools/io_tracer_parser_tool.cc +46 -22
  631. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +655 -179
  632. package/deps/rocksdb/rocksdb/tools/ldb_cmd_impl.h +58 -6
  633. package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +472 -29
  634. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +23 -2
  635. package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +2 -2
  636. package/deps/rocksdb/rocksdb/tools/simulated_hybrid_file_system.cc +246 -0
  637. package/deps/rocksdb/rocksdb/tools/simulated_hybrid_file_system.h +126 -0
  638. package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +83 -29
  639. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +38 -17
  640. package/deps/rocksdb/rocksdb/tools/trace_analyzer_test.cc +191 -55
  641. package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.cc +219 -296
  642. package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.h +87 -53
  643. package/deps/rocksdb/rocksdb/tools/write_stress.cc +8 -7
  644. package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer.cc +6 -5
  645. package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer.h +5 -4
  646. package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer_test.cc +14 -9
  647. package/deps/rocksdb/rocksdb/trace_replay/io_tracer.cc +134 -60
  648. package/deps/rocksdb/rocksdb/trace_replay/io_tracer.h +49 -38
  649. package/deps/rocksdb/rocksdb/trace_replay/io_tracer_test.cc +152 -15
  650. package/deps/rocksdb/rocksdb/trace_replay/trace_record.cc +206 -0
  651. package/deps/rocksdb/rocksdb/trace_replay/trace_record_handler.cc +190 -0
  652. package/deps/rocksdb/rocksdb/trace_replay/trace_record_handler.h +46 -0
  653. package/deps/rocksdb/rocksdb/trace_replay/trace_record_result.cc +146 -0
  654. package/deps/rocksdb/rocksdb/trace_replay/trace_replay.cc +475 -344
  655. package/deps/rocksdb/rocksdb/trace_replay/trace_replay.h +83 -95
  656. package/deps/rocksdb/rocksdb/util/autovector.h +38 -18
  657. package/deps/rocksdb/rocksdb/util/autovector_test.cc +1 -1
  658. package/deps/rocksdb/rocksdb/util/bloom_impl.h +4 -0
  659. package/deps/rocksdb/rocksdb/util/bloom_test.cc +276 -94
  660. package/deps/rocksdb/rocksdb/util/build_version.cc.in +81 -4
  661. package/deps/rocksdb/rocksdb/util/cast_util.h +22 -0
  662. package/deps/rocksdb/rocksdb/util/channel.h +2 -0
  663. package/deps/rocksdb/rocksdb/util/coding.h +1 -33
  664. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +8 -0
  665. package/deps/rocksdb/rocksdb/util/comparator.cc +163 -3
  666. package/deps/rocksdb/rocksdb/util/compression.cc +122 -0
  667. package/deps/rocksdb/rocksdb/util/compression.h +212 -7
  668. package/deps/rocksdb/rocksdb/util/compression_context_cache.cc +1 -3
  669. package/deps/rocksdb/rocksdb/util/crc32c.cc +165 -2
  670. package/deps/rocksdb/rocksdb/util/crc32c.h +6 -0
  671. package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +14 -0
  672. package/deps/rocksdb/rocksdb/util/crc32c_ppc.h +3 -0
  673. package/deps/rocksdb/rocksdb/util/crc32c_test.cc +47 -0
  674. package/deps/rocksdb/rocksdb/util/defer.h +30 -1
  675. package/deps/rocksdb/rocksdb/util/defer_test.cc +11 -0
  676. package/deps/rocksdb/rocksdb/util/duplicate_detector.h +3 -1
  677. package/deps/rocksdb/rocksdb/util/dynamic_bloom.h +3 -3
  678. package/deps/rocksdb/rocksdb/util/dynamic_bloom_test.cc +5 -4
  679. package/deps/rocksdb/rocksdb/util/fastrange.h +2 -0
  680. package/deps/rocksdb/rocksdb/util/file_checksum_helper.cc +36 -0
  681. package/deps/rocksdb/rocksdb/util/file_checksum_helper.h +3 -1
  682. package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +512 -52
  683. package/deps/rocksdb/rocksdb/util/filter_bench.cc +65 -10
  684. package/deps/rocksdb/rocksdb/util/gflags_compat.h +6 -1
  685. package/deps/rocksdb/rocksdb/util/hash.cc +121 -3
  686. package/deps/rocksdb/rocksdb/util/hash.h +31 -1
  687. package/deps/rocksdb/rocksdb/util/hash128.h +26 -0
  688. package/deps/rocksdb/rocksdb/util/hash_containers.h +51 -0
  689. package/deps/rocksdb/rocksdb/util/hash_test.cc +194 -2
  690. package/deps/rocksdb/rocksdb/util/heap.h +6 -1
  691. package/deps/rocksdb/rocksdb/util/kv_map.h +1 -1
  692. package/deps/rocksdb/rocksdb/util/log_write_bench.cc +8 -6
  693. package/deps/rocksdb/rocksdb/util/math.h +74 -7
  694. package/deps/rocksdb/rocksdb/util/math128.h +13 -1
  695. package/deps/rocksdb/rocksdb/util/murmurhash.h +3 -3
  696. package/deps/rocksdb/rocksdb/util/random.cc +9 -0
  697. package/deps/rocksdb/rocksdb/util/random.h +6 -0
  698. package/deps/rocksdb/rocksdb/util/rate_limiter.cc +298 -144
  699. package/deps/rocksdb/rocksdb/util/rate_limiter.h +68 -19
  700. package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +335 -23
  701. package/deps/rocksdb/rocksdb/util/repeatable_thread.h +10 -12
  702. package/deps/rocksdb/rocksdb/util/repeatable_thread_test.cc +18 -15
  703. package/deps/rocksdb/rocksdb/util/ribbon_alg.h +98 -74
  704. package/deps/rocksdb/rocksdb/util/ribbon_config.cc +506 -0
  705. package/deps/rocksdb/rocksdb/util/ribbon_config.h +182 -0
  706. package/deps/rocksdb/rocksdb/util/ribbon_impl.h +154 -79
  707. package/deps/rocksdb/rocksdb/util/ribbon_test.cc +742 -365
  708. package/deps/rocksdb/rocksdb/util/set_comparator.h +2 -0
  709. package/deps/rocksdb/rocksdb/util/slice.cc +198 -35
  710. package/deps/rocksdb/rocksdb/util/slice_test.cc +30 -1
  711. package/deps/rocksdb/rocksdb/util/status.cc +32 -29
  712. package/deps/rocksdb/rocksdb/util/stop_watch.h +18 -18
  713. package/deps/rocksdb/rocksdb/util/string_util.cc +85 -6
  714. package/deps/rocksdb/rocksdb/util/string_util.h +47 -2
  715. package/deps/rocksdb/rocksdb/util/thread_guard.h +41 -0
  716. package/deps/rocksdb/rocksdb/util/thread_local.h +2 -2
  717. package/deps/rocksdb/rocksdb/util/thread_local_test.cc +22 -24
  718. package/deps/rocksdb/rocksdb/util/threadpool_imp.cc +7 -6
  719. package/deps/rocksdb/rocksdb/util/timer.h +55 -46
  720. package/deps/rocksdb/rocksdb/util/timer_test.cc +50 -48
  721. package/deps/rocksdb/rocksdb/util/user_comparator_wrapper.h +4 -0
  722. package/deps/rocksdb/rocksdb/util/vector_iterator.h +31 -15
  723. package/deps/rocksdb/rocksdb/util/work_queue.h +2 -0
  724. package/deps/rocksdb/rocksdb/util/xxhash.cc +35 -1144
  725. package/deps/rocksdb/rocksdb/util/xxhash.h +5117 -373
  726. package/deps/rocksdb/rocksdb/util/xxph3.h +1762 -0
  727. package/deps/rocksdb/rocksdb/utilities/agg_merge/agg_merge.cc +238 -0
  728. package/deps/rocksdb/rocksdb/utilities/agg_merge/agg_merge.h +49 -0
  729. package/deps/rocksdb/rocksdb/utilities/agg_merge/agg_merge_test.cc +134 -0
  730. package/deps/rocksdb/rocksdb/utilities/agg_merge/test_agg_merge.cc +104 -0
  731. package/deps/rocksdb/rocksdb/utilities/agg_merge/test_agg_merge.h +47 -0
  732. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +3164 -0
  733. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_impl.h +29 -0
  734. package/deps/rocksdb/rocksdb/utilities/{backupable/backupable_db_test.cc → backup/backup_engine_test.cc} +1679 -485
  735. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc +6 -4
  736. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.h +14 -9
  737. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.cc +2 -0
  738. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.h +1 -0
  739. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_gc_stats.h +4 -0
  740. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +37 -27
  741. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +8 -4
  742. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +1 -1
  743. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_iterator.h +13 -10
  744. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +5 -0
  745. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +44 -25
  746. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +3 -4
  747. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.cc +27 -19
  748. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.h +4 -2
  749. package/deps/rocksdb/rocksdb/utilities/cache_dump_load.cc +69 -0
  750. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +489 -0
  751. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +366 -0
  752. package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_compaction_filter.cc +67 -4
  753. package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_compaction_filter.h +21 -6
  754. package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_functional_test.cc +107 -7
  755. package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_options.h +43 -0
  756. package/deps/rocksdb/rocksdb/utilities/cassandra/format.h +1 -1
  757. package/deps/rocksdb/rocksdb/utilities/cassandra/merge_operator.cc +24 -8
  758. package/deps/rocksdb/rocksdb/utilities/cassandra/merge_operator.h +7 -7
  759. package/deps/rocksdb/rocksdb/utilities/cassandra/serialize.h +5 -0
  760. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +99 -218
  761. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.h +8 -24
  762. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +114 -1
  763. package/deps/rocksdb/rocksdb/utilities/compaction_filters/layered_compaction_filter_base.h +6 -2
  764. package/deps/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc +0 -4
  765. package/deps/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h +7 -6
  766. package/deps/rocksdb/rocksdb/utilities/compaction_filters.cc +56 -0
  767. package/deps/rocksdb/rocksdb/utilities/convenience/info_log_finder.cc +2 -2
  768. package/deps/rocksdb/rocksdb/utilities/counted_fs.cc +355 -0
  769. package/deps/rocksdb/rocksdb/utilities/counted_fs.h +152 -0
  770. package/deps/rocksdb/rocksdb/utilities/env_mirror.cc +13 -0
  771. package/deps/rocksdb/rocksdb/utilities/env_timed.cc +164 -122
  772. package/deps/rocksdb/rocksdb/utilities/env_timed.h +97 -0
  773. package/deps/rocksdb/rocksdb/utilities/fault_injection_env.cc +75 -17
  774. package/deps/rocksdb/rocksdb/utilities/fault_injection_env.h +19 -3
  775. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +539 -126
  776. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +162 -17
  777. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +110 -0
  778. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +94 -0
  779. package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +5 -2
  780. package/deps/rocksdb/rocksdb/utilities/memory_allocators.h +104 -0
  781. package/deps/rocksdb/rocksdb/utilities/merge_operators/bytesxor.h +5 -3
  782. package/deps/rocksdb/rocksdb/utilities/merge_operators/max.cc +4 -1
  783. package/deps/rocksdb/rocksdb/utilities/merge_operators/put.cc +11 -3
  784. package/deps/rocksdb/rocksdb/utilities/merge_operators/sortlist.cc +0 -2
  785. package/deps/rocksdb/rocksdb/utilities/merge_operators/sortlist.h +5 -1
  786. package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend.cc +29 -10
  787. package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend.h +6 -3
  788. package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend2.cc +29 -14
  789. package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend2.h +6 -3
  790. package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc +71 -18
  791. package/deps/rocksdb/rocksdb/utilities/merge_operators/uint64add.cc +15 -9
  792. package/deps/rocksdb/rocksdb/utilities/merge_operators.cc +120 -0
  793. package/deps/rocksdb/rocksdb/utilities/merge_operators.h +3 -23
  794. package/deps/rocksdb/rocksdb/utilities/object_registry.cc +267 -42
  795. package/deps/rocksdb/rocksdb/utilities/object_registry_test.cc +702 -76
  796. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +1 -1
  797. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration_test.cc +26 -5
  798. package/deps/rocksdb/rocksdb/utilities/options/options_util.cc +1 -1
  799. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +124 -1
  800. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.cc +2 -3
  801. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.h +8 -9
  802. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc +15 -13
  803. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h +1 -1
  804. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h +4 -4
  805. package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table_evictable.h +2 -2
  806. package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_bench.cc +8 -9
  807. package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_test.cc +1 -1
  808. package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_tier.h +6 -3
  809. package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.h +2 -2
  810. package/deps/rocksdb/rocksdb/utilities/simulator_cache/cache_simulator.cc +3 -0
  811. package/deps/rocksdb/rocksdb/utilities/simulator_cache/cache_simulator_test.cc +2 -0
  812. package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc +43 -35
  813. package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache_test.cc +20 -18
  814. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc +107 -2
  815. package/deps/rocksdb/rocksdb/utilities/trace/file_trace_reader_writer.cc +23 -15
  816. package/deps/rocksdb/rocksdb/utilities/trace/file_trace_reader_writer.h +2 -2
  817. package/deps/rocksdb/rocksdb/utilities/trace/replayer_impl.cc +316 -0
  818. package/deps/rocksdb/rocksdb/utilities/trace/replayer_impl.h +86 -0
  819. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc +4 -5
  820. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.h +4 -3
  821. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h +1 -1
  822. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_locking_test.cc +119 -3
  823. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc +20 -3
  824. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h +20 -0
  825. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h +3 -2
  826. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h +4 -0
  827. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc +38 -14
  828. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h +17 -10
  829. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h +1 -0
  830. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +1 -2
  831. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +423 -34
  832. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +82 -2
  833. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +72 -40
  834. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.h +32 -1
  835. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +13 -5
  836. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +7 -3
  837. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +207 -43
  838. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +50 -7
  839. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.cc +28 -10
  840. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.h +11 -6
  841. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +516 -0
  842. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +506 -15
  843. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +27 -13
  844. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +14 -14
  845. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.h +3 -0
  846. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +2 -2
  847. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +14 -5
  848. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +305 -27
  849. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +55 -159
  850. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +209 -2
  851. package/deps/rocksdb/rocksdb/utilities/wal_filter.cc +23 -0
  852. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +157 -88
  853. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +501 -114
  854. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +91 -316
  855. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +1212 -672
  856. package/deps/rocksdb/rocksdb.gyp +425 -446
  857. package/index.js +5 -87
  858. package/package-lock.json +23687 -0
  859. package/package.json +8 -9
  860. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  861. package/prebuilds/darwin-x64/node.napi.node +0 -0
  862. package/prebuilds/{darwin-x64+arm64 → linux-x64}/node.napi.node +0 -0
  863. package/deps/rocksdb/rocksdb/README.md +0 -32
  864. package/deps/rocksdb/rocksdb/env/env_hdfs.cc +0 -648
  865. package/deps/rocksdb/rocksdb/hdfs/README +0 -23
  866. package/deps/rocksdb/rocksdb/hdfs/env_hdfs.h +0 -386
  867. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/backupable_db.h +0 -535
  868. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/env_librados.h +0 -175
  869. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/utility_db.h +0 -34
  870. package/deps/rocksdb/rocksdb/memory/memkind_kmem_allocator_test.cc +0 -102
  871. package/deps/rocksdb/rocksdb/memtable/hash_linklist_rep.h +0 -49
  872. package/deps/rocksdb/rocksdb/memtable/hash_skiplist_rep.h +0 -44
  873. package/deps/rocksdb/rocksdb/options/customizable_helper.h +0 -216
  874. package/deps/rocksdb/rocksdb/port/README +0 -10
  875. package/deps/rocksdb/rocksdb/third-party/folly/folly/CPortability.h +0 -27
  876. package/deps/rocksdb/rocksdb/third-party/folly/folly/ConstexprMath.h +0 -45
  877. package/deps/rocksdb/rocksdb/third-party/folly/folly/Indestructible.h +0 -166
  878. package/deps/rocksdb/rocksdb/third-party/folly/folly/Optional.h +0 -570
  879. package/deps/rocksdb/rocksdb/third-party/folly/folly/Portability.h +0 -92
  880. package/deps/rocksdb/rocksdb/third-party/folly/folly/ScopeGuard.h +0 -54
  881. package/deps/rocksdb/rocksdb/third-party/folly/folly/Traits.h +0 -152
  882. package/deps/rocksdb/rocksdb/third-party/folly/folly/Unit.h +0 -59
  883. package/deps/rocksdb/rocksdb/third-party/folly/folly/Utility.h +0 -141
  884. package/deps/rocksdb/rocksdb/third-party/folly/folly/chrono/Hardware.h +0 -33
  885. package/deps/rocksdb/rocksdb/third-party/folly/folly/container/Array.h +0 -74
  886. package/deps/rocksdb/rocksdb/third-party/folly/folly/detail/Futex-inl.h +0 -117
  887. package/deps/rocksdb/rocksdb/third-party/folly/folly/detail/Futex.cpp +0 -263
  888. package/deps/rocksdb/rocksdb/third-party/folly/folly/detail/Futex.h +0 -96
  889. package/deps/rocksdb/rocksdb/third-party/folly/folly/functional/Invoke.h +0 -40
  890. package/deps/rocksdb/rocksdb/third-party/folly/folly/hash/Hash.h +0 -29
  891. package/deps/rocksdb/rocksdb/third-party/folly/folly/lang/Align.h +0 -144
  892. package/deps/rocksdb/rocksdb/third-party/folly/folly/lang/Bits.h +0 -30
  893. package/deps/rocksdb/rocksdb/third-party/folly/folly/lang/Launder.h +0 -51
  894. package/deps/rocksdb/rocksdb/third-party/folly/folly/portability/Asm.h +0 -28
  895. package/deps/rocksdb/rocksdb/third-party/folly/folly/portability/SysSyscall.h +0 -10
  896. package/deps/rocksdb/rocksdb/third-party/folly/folly/portability/SysTypes.h +0 -26
  897. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/AtomicNotification-inl.h +0 -138
  898. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/AtomicNotification.cpp +0 -23
  899. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/AtomicNotification.h +0 -57
  900. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/AtomicUtil-inl.h +0 -260
  901. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/AtomicUtil.h +0 -52
  902. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/Baton.h +0 -328
  903. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex-inl.h +0 -1703
  904. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex.cpp +0 -16
  905. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex.h +0 -304
  906. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutexSpecializations.h +0 -39
  907. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/ParkingLot.cpp +0 -26
  908. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/ParkingLot.h +0 -318
  909. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/WaitOptions.h +0 -57
  910. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/detail/InlineFunctionRef.h +0 -219
  911. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/detail/ProxyLockable-inl.h +0 -207
  912. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/detail/ProxyLockable.h +0 -164
  913. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/detail/Sleeper.h +0 -57
  914. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/detail/Spin.h +0 -77
  915. package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/test/DistributedMutexTest.cpp +0 -1145
  916. package/deps/rocksdb/rocksdb/util/build_version.h +0 -15
  917. package/deps/rocksdb/rocksdb/util/xxh3p.h +0 -1392
  918. package/deps/rocksdb/rocksdb/utilities/backupable/backupable_db.cc +0 -2354
  919. package/deps/rocksdb/rocksdb/utilities/env_librados.cc +0 -1497
  920. package/deps/rocksdb/rocksdb/utilities/env_librados_test.cc +0 -1146
  921. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/README +0 -13
  922. package/deps/snappy/snappy-1.1.7/README.md +0 -149
  923. package/prebuilds/linux-x64/node.napi.glibc.node +0 -0
@@ -1,1703 +0,0 @@
1
- // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2
- // This source code is licensed under both the GPLv2 (found in the
3
- // COPYING file in the root directory) and Apache 2.0 License
4
- // (found in the LICENSE.Apache file in the root directory).
5
-
6
- #include <folly/synchronization/DistributedMutex.h>
7
-
8
- #include <folly/ConstexprMath.h>
9
- #include <folly/Portability.h>
10
- #include <folly/ScopeGuard.h>
11
- #include <folly/Utility.h>
12
- #include <folly/chrono/Hardware.h>
13
- #include <folly/detail/Futex.h>
14
- #include <folly/lang/Align.h>
15
- #include <folly/lang/Bits.h>
16
- #include <folly/portability/Asm.h>
17
- #include <folly/synchronization/AtomicNotification.h>
18
- #include <folly/synchronization/AtomicUtil.h>
19
- #include <folly/synchronization/detail/InlineFunctionRef.h>
20
- #include <folly/synchronization/detail/Sleeper.h>
21
-
22
- #include <array>
23
- #include <atomic>
24
- #include <cstddef>
25
- #include <cstdint>
26
- #include <limits>
27
- #include <stdexcept>
28
- #include <thread>
29
- #include <utility>
30
-
31
- namespace folly {
32
- namespace detail {
33
- namespace distributed_mutex {
34
- // kUnlocked is used to show unlocked state
35
- //
36
- // When locking threads encounter kUnlocked in the underlying storage, they
37
- // can just acquire the lock without any further effort
38
- constexpr auto kUnlocked = std::uintptr_t{0b0};
39
- // kLocked is used to show that the mutex is currently locked, and future
40
- // attempts to lock the mutex should enqueue on the central storage
41
- //
42
- // Locking threads find this on central storage only when there is a
43
- // contention chain that is undergoing wakeups, in every other case, a locker
44
- // will either find kUnlocked or an arbitrary address with the kLocked bit set
45
- constexpr auto kLocked = std::uintptr_t{0b1};
46
- // kTimedWaiter is set when there is at least one timed waiter on the mutex
47
- //
48
- // Timed waiters do not follow the sleeping strategy employed by regular,
49
- // non-timed threads. They sleep on the central mutex atomic through an
50
- // extended futex() interface that allows sleeping with the same semantics for
51
- // non-standard integer widths
52
- //
53
- // When a regular non-timed thread unlocks or enqueues on the mutex, and sees
54
- // a timed waiter, it takes ownership of all the timed waiters. The thread
55
- // that has taken ownership of the timed waiter releases the timed waiters
56
- // when it gets a chance at the critical section. At which point it issues a
57
- // wakeup to single timed waiter, timed waiters always issue wake() calls to
58
- // other timed waiters
59
- constexpr auto kTimedWaiter = std::uintptr_t{0b10};
60
-
61
- // kUninitialized means that the thread has just enqueued, and has not yet
62
- // gotten to initializing itself with the address of its successor
63
- //
64
- // this becomes significant for threads that are trying to wake up the
65
- // uninitialized thread, if they see that the thread is not yet initialized,
66
- // they can do nothing but spin, and wait for the thread to get initialized
67
- //
68
- // This also plays a role in the functioning of flat combining as implemented
69
- // in DistributedMutex. When a thread owning the lock goes through the
70
- // contention chain to either unlock the mutex or combine critical sections
71
- // from the other end. The presence of kUninitialized means that the
72
- // combining thread is not able to make progress after this point. So we
73
- // transfer the lock.
74
- constexpr auto kUninitialized = std::uint32_t{0b0};
75
- // kWaiting will be set in the waiter's futex structs while they are spinning
76
- // while waiting for the mutex
77
- constexpr auto kWaiting = std::uint32_t{0b1};
78
- // kWake will be set by threads that are waking up waiters that have enqueued
79
- constexpr auto kWake = std::uint32_t{0b10};
80
- // kSkipped will be set by a waker when they see that a waiter has been
81
- // preempted away by the kernel, in this case the thread that got skipped will
82
- // have to wake up and put itself back on the queue
83
- constexpr auto kSkipped = std::uint32_t{0b11};
84
- // kAboutToWait will be set by a waiter that enqueues itself with the purpose
85
- // of waiting on a futex
86
- constexpr auto kAboutToWait = std::uint32_t{0b100};
87
- // kSleeping will be set by a waiter right before enqueueing on a futex. When
88
- // a thread wants to wake up a waiter that has enqueued on a futex, it should
89
- // set the futex to contain kWake
90
- //
91
- // a thread that is unlocking and wants to skip over a sleeping thread also
92
- // calls futex_.exchange(kSleeping) on the sleeping thread's futex word. It
93
- // does this to 1. detect whether the sleeping thread had actually gone to
94
- // sleeping on the futex word so it can skip it, and 2. to synchronize with
95
- // other non atomic writes in the sleeping thread's context (such as the write
96
- // to track the next waiting thread).
97
- //
98
- // We reuse kSleeping instead of say using another constant kEarlyDelivery to
99
- // avoid situations where a thread has to enter kernel mode due to calling
100
- // futexWait() twice because of the presence of a waking thread. This
101
- // situation can arise when an unlocking thread goes to skip over a sleeping
102
- // thread, sees that the thread has slept and move on, but the sleeping thread
103
- // had not yet entered futex(). This interleaving causes the thread calling
104
- // futex() to return spuriously, as the futex word is not what it should be
105
- constexpr auto kSleeping = std::uint32_t{0b101};
106
- // kCombined is set by the lock holder to let the waiter thread know that its
107
- // combine request was successfully completed by the lock holder. A
108
- // successful combine means that the thread requesting the combine operation
109
- // does not need to unlock the mutex; in fact, doing so would be an error.
110
- constexpr auto kCombined = std::uint32_t{0b111};
111
- // kCombineUninitialized is like kUninitialized but is set by a thread when it
112
- // enqueues in hopes of getting its critical section combined with the lock
113
- // holder
114
- constexpr auto kCombineUninitialized = std::uint32_t{0b1000};
115
- // kCombineWaiting is set by a thread when it is ready to have its combine
116
- // record fulfilled by the lock holder. In particular, this signals to the
117
- // lock holder that the thread has set its next_ pointer in the contention
118
- // chain
119
- constexpr auto kCombineWaiting = std::uint32_t{0b1001};
120
- // kExceptionOccurred is set on the waiter futex when the remote task throws
121
- // an exception. It is the caller's responsibility to retrieve the exception
122
- // and rethrow it in their own context. Note that when the caller uses a
123
- // noexcept function as their critical section, they can avoid checking for
124
- // this value
125
- //
126
- // This allows us to avoid all cost of exceptions in the memory layout of the
127
- // fast path (no errors) as exceptions are stored as an std::exception_ptr in
128
- // the same union that stores the return value of the critical section. We
129
- // also avoid all CPU overhead because the combiner uses a try-catch block
130
- // without any additional branching to handle exceptions
131
- constexpr auto kExceptionOccurred = std::uint32_t{0b1010};
132
-
133
- // The number of spins that we are allowed to do before we resort to marking a
134
- // thread as having slept
135
- //
136
- // This is just a magic number from benchmarks
137
- constexpr auto kScheduledAwaySpinThreshold = std::chrono::nanoseconds{200};
138
- // The maximum number of spins before a thread starts yielding its processor
139
- // in hopes of getting skipped
140
- constexpr auto kMaxSpins = 4000;
141
- // The maximum number of contention chains we can resolve with flat combining.
142
- // After this number of contention chains, the mutex falls back to regular
143
- // two-phased mutual exclusion to ensure that we don't starve the combiner
144
- // thread
145
- constexpr auto kMaxCombineIterations = 2;
146
-
147
- /**
148
- * Write only data that is available to the thread that is waking up another.
149
- * Only the waking thread is allowed to write to this, the thread to be woken
150
- * is allowed to read from this after a wakeup has been issued
151
- */
152
- template <template <typename> class Atomic>
153
- class WakerMetadata {
154
- public:
155
- explicit WakerMetadata(
156
- std::uintptr_t waker = 0,
157
- std::uintptr_t waiters = 0,
158
- std::uint32_t sleeper = kUninitialized)
159
- : waker_{waker}, waiters_{waiters}, sleeper_{sleeper} {}
160
-
161
- // This is the thread that initiated wakeups for the contention chain.
162
- // There can only ever be one thread that initiates the wakeup for a
163
- // chain in the spin only version of this mutex. When a thread that just
164
- // woke up sees this as the next thread to wake up, it knows that it is the
165
- // terminal node in the contention chain. This means that it was the one
166
- // that took off the thread that had acquired the mutex off the centralized
167
- // state. Therefore, the current thread is the last in its contention
168
- // chain. It will fall back to centralized storage to pick up the next
169
- // waiter or release the mutex
170
- //
171
- // When we move to a full sleeping implementation, this might need to change
172
- // to a small_vector<> to account for failed wakeups, or we can put threads
173
- // to sleep on the central futex, which is an easier implementation
174
- // strategy. Although, since this is allocated on the stack, we can set a
175
- // prohitively large threshold to avoid heap allocations, this strategy
176
- // however, might cause increased cache misses on wakeup signalling
177
- std::uintptr_t waker_{0};
178
- // the list of threads that the waker had previously seen to be sleeping on
179
- // a futex(),
180
- //
181
- // this is given to the current thread as a means to pass on
182
- // information. When the current thread goes to unlock the mutex and does
183
- // not see contention, it should go and wake up the head of this list. If
184
- // the current thread sees a contention chain on the mutex, it should pass
185
- // on this list to the next thread that gets woken up
186
- std::uintptr_t waiters_{0};
187
- // The futex that this waiter will sleep on
188
- //
189
- // how can we reuse futex_ from above for futex management?
190
- Futex<Atomic> sleeper_{kUninitialized};
191
- };
192
-
193
- /**
194
- * Type of the type-erased callable that is used for combining from the lock
195
- * holder's end. This has 48 bytes of inline storage that can be used to
196
- * minimize cache misses when combining
197
- */
198
- using CombineFunction = detail::InlineFunctionRef<void(), 48>;
199
-
200
- /**
201
- * Waiter encapsulates the state required for waiting on the mutex, this
202
- * contains potentially heavy state and is intended to be allocated on the
203
- * stack as part of a lock() function call
204
- *
205
- * To ensure that synchronization does not cause unintended side effects on
206
- * the rest of the thread stack (eg. metadata in lockImplementation(), or any
207
- * other data in the user's thread), we aggresively pad this struct and use
208
- * custom alignment internally to ensure that the relevant data fits within a
209
- * single cacheline. The added alignment here also gives us some room to
210
- * wiggle in the bottom few bits of the mutex, where we store extra metadata
211
- */
212
- template <template <typename> class Atomic>
213
- class Waiter {
214
- public:
215
- Waiter() {}
216
- Waiter(Waiter&&) = delete;
217
- Waiter(const Waiter&) = delete;
218
- Waiter& operator=(Waiter&&) = delete;
219
- Waiter& operator=(const Waiter&) = delete;
220
-
221
- void initialize(std::uint64_t futex, CombineFunction task) {
222
- // we only initialize the function if we were actually given a non-null
223
- // task, otherwise
224
- if (task) {
225
- assert(futex == kCombineUninitialized);
226
- new (&function_) CombineFunction(task);
227
- } else {
228
- assert((futex == kUninitialized) || (futex == kAboutToWait));
229
- new (&metadata_) WakerMetadata<Atomic>{};
230
- }
231
-
232
- // this pedantic store is needed to ensure that the waking thread
233
- // synchronizes with the state in the waiter struct when it loads the
234
- // value of the futex word
235
- //
236
- // on x86, this gets optimized away to just a regular store, it might be
237
- // needed on platforms where explicit acquire-release barriers are
238
- // required for synchronization
239
- //
240
- // note that we release here at the end of the constructor because
241
- // construction is complete here, any thread that acquires this release
242
- // will see a well constructed wait node
243
- futex_.store(futex, std::memory_order_release);
244
- }
245
-
246
- std::array<std::uint8_t, hardware_destructive_interference_size> padding1;
247
- // the atomic that this thread will spin on while waiting for the mutex to
248
- // be unlocked
249
- alignas(hardware_destructive_interference_size) Atomic<std::uint64_t> futex_{
250
- kUninitialized};
251
- // The successor of this node. This will be the thread that had its address
252
- // on the mutex previously
253
- //
254
- // We can do without making this atomic since the remote thread synchronizes
255
- // on the futex variable above. If this were not atomic, the remote thread
256
- // would only be allowed to read from it after the waiter has moved into the
257
- // waiting state to avoid risk of a load racing with a write. However, it
258
- // helps to make this atomic because we can use an unconditional load and make
259
- // full use of the load buffer to coalesce both reads into a single clock
260
- // cycle after the line arrives in the combiner core. This is a heavily
261
- // contended line, so an RFO from the enqueueing thread is highly likely and
262
- // has the potential to cause an immediate invalidation; blocking the combiner
263
- // thread from making progress until the line is pulled back to read this
264
- // value
265
- //
266
- // Further, making this atomic prevents the compiler from making an incorrect
267
- // optimization where it does not load the value as written in the code, but
268
- // rather dereferences it through a pointer whenever needed (since the value
269
- // of the pointer to this is readily available on the stack). Doing this
270
- // causes multiple invalidation requests from the enqueueing thread, blocking
271
- // remote progress
272
- //
273
- // Note that we use relaxed loads and stores, so this should not have any
274
- // additional overhead compared to a regular load on most architectures
275
- std::atomic<std::uintptr_t> next_{0};
276
- // We use an anonymous union for the combined critical section request and
277
- // the metadata that will be filled in from the leader's end. Only one is
278
- // active at a time - if a leader decides to combine the requested critical
279
- // section into its execution, it will not touch the metadata field. If a
280
- // leader decides to migrate the lock to the waiter, it will not touch the
281
- // function
282
- //
283
- // this allows us to transfer more state when combining a critical section
284
- // and reduce the cache misses originating from executing an arbitrary
285
- // lambda
286
- //
287
- // note that this is an anonymous union, not an unnamed union, the members
288
- // leak into the surrounding scope
289
- union {
290
- // metadata for the waker
291
- WakerMetadata<Atomic> metadata_;
292
- // The critical section that can potentially be combined into the critical
293
- // section of the locking thread
294
- //
295
- // This is kept as a FunctionRef because the original function is preserved
296
- // until the lock_combine() function returns. A consequence of using
297
- // FunctionRef here is that we don't need to do any allocations and can
298
- // allow users to capture unbounded state into the critical section. Flat
299
- // combining means that the user does not have access to the thread
300
- // executing the critical section, so assumptions about thread local
301
- // references can be invalidated. Being able to capture arbitrary state
302
- // allows the user to do thread local accesses right before the critical
303
- // section and pass them as state to the callable being referenced here
304
- CombineFunction function_;
305
- // The user is allowed to use a combined critical section that returns a
306
- // value. This buffer is used to implement the value transfer to the
307
- // waiting thread. We reuse the same union because this helps us combine
308
- // one synchronization operation with a material value transfer.
309
- //
310
- // The waker thread needs to synchronize on this cacheline to issue a
311
- // wakeup to the waiter, meaning that the entire line needs to be pulled
312
- // into the remote core in exclusive mode. So we reuse the coherence
313
- // operation to transfer the return value in addition to the
314
- // synchronization signal. In the case that the user's data item is
315
- // small, the data is transferred all inline as part of the same line,
316
- // which pretty much arrives into the CPU cache in the same clock cycle or
317
- // two after a read-for-ownership request. This gives us a high chance of
318
- // coalescing the entire transitive store buffer together into one cache
319
- // coherence operation from the waker's end. This allows us to make use
320
- // of the CPU bus bandwidth which would have otherwise gone to waste.
321
- // Benchmarks prove this theory under a wide range of contention, value
322
- // sizes, NUMA interactions and processor models
323
- //
324
- // The current version of the Intel optimization manual confirms this
325
- // theory somewhat as well in section 2.3.5.1 (Load and Store Operation
326
- // Overview)
327
- //
328
- // When an instruction writes data to a memory location [...], the
329
- // processor ensures that it has the line containing this memory location
330
- // is in its L1d cache [...]. If the cache line is not there, it fetches
331
- // from the next levels using a RFO request [...] RFO and storing the
332
- // data happens after instruction retirement. Therefore, the store
333
- // latency usually does not affect the store instruction itself
334
- //
335
- // This gives the user the ability to input up to 48 bytes into the
336
- // combined critical section through an InlineFunctionRef and output 48
337
- // bytes from it basically without any cost. The type of the entity
338
- // stored in the buffer has to be matched by the type erased callable that
339
- // the caller has used. At this point, the caller is still in the
340
- // template instantiation leading to the combine request, so it has
341
- // knowledge of the return type and can apply the appropriate
342
- // reinterpret_cast and launder operation to safely retrieve the data from
343
- // this buffer
344
- _t<std::aligned_storage<48, 8>> storage_;
345
- };
346
- std::array<std::uint8_t, hardware_destructive_interference_size> padding2;
347
- };
348
-
349
- /**
350
- * A template that helps us differentiate between the different ways to return
351
- * a value from a combined critical section. A return value of type void
352
- * cannot be stored anywhere, so we use specializations and pick the right one
353
- * switched through std::conditional_t
354
- *
355
- * This is then used by CoalescedTask and its family of functions to implement
356
- * efficient return value transfers to the waiting threads
357
- */
358
- template <typename Func>
359
- class RequestWithReturn {
360
- public:
361
- using F = Func;
362
- using ReturnType = decltype(std::declval<const Func&>()());
363
- explicit RequestWithReturn(Func func) : func_{std::move(func)} {}
364
-
365
- /**
366
- * We need to define the destructor here because C++ requires (with good
367
- * reason) that a union with non-default destructor be explicitly destroyed
368
- * from the surrounding class, as neither the runtime nor compiler have the
369
- * knowledge of what to do with a union at the time of destruction
370
- *
371
- * Each request that has a valid return value set will have the value
372
- * retrieved from the get() method, where the value is destroyed. So we
373
- * don't need to destroy it here
374
- */
375
- ~RequestWithReturn() {}
376
-
377
- /**
378
- * This method can be used to return a value from the request. This returns
379
- * the underlying value because return type of the function we were
380
- * instantiated with is not void
381
- */
382
- ReturnType get() && {
383
- // when the return value has been processed, we destroy the value
384
- // contained in this request. Using a scope_exit means that we don't have
385
- // to worry about storing the value somewhere and causing potentially an
386
- // extra move
387
- //
388
- // note that the invariant here is that this function is only called if the
389
- // requesting thread had it's critical section combined, and the value_
390
- // member constructed through detach()
391
- SCOPE_EXIT {
392
- value_.~ReturnType();
393
- };
394
- return std::move(value_);
395
- }
396
-
397
- // this contains a copy of the function the waiter had requested to be
398
- // executed as a combined critical section
399
- Func func_;
400
- // this stores the return value used in the request, we use a union here to
401
- // avoid laundering and allow return types that are not default
402
- // constructible to be propagated through the execution of the critical
403
- // section
404
- //
405
- // note that this is an anonymous union, the member leaks into the
406
- // surrounding scope as a member variable
407
- union {
408
- ReturnType value_;
409
- };
410
- };
411
-
412
- template <typename Func>
413
- class RequestWithoutReturn {
414
- public:
415
- using F = Func;
416
- using ReturnType = void;
417
- explicit RequestWithoutReturn(Func func) : func_{std::move(func)} {}
418
-
419
- /**
420
- * In this version of the request class, get() returns nothing as there is
421
- * no stored value
422
- */
423
- void get() && {}
424
-
425
- // this contains a copy of the function the waiter had requested to be
426
- // executed as a combined critical section
427
- Func func_;
428
- };
429
-
430
- // we need to use std::integral_constant::value here as opposed to
431
- // std::integral_constant::operator T() because MSVC errors out with the
432
- // implicit conversion
433
- template <typename Func>
434
- using Request = _t<std::conditional<
435
- std::is_void<decltype(std::declval<const Func&>()())>::value,
436
- RequestWithoutReturn<Func>,
437
- RequestWithReturn<Func>>>;
438
-
439
- /**
440
- * A template that helps us to transform a callable returning a value to one
441
- * that returns void so it can be type erased and passed on to the waker. If
442
- * the return value is small enough, it gets coalesced into the wait struct
443
- * for optimal data transfer. When it's not small enough to fit in the waiter
444
- * storage buffer, we place it on it's own cacheline with isolation to prevent
445
- * false-sharing with the on-stack metadata of the waiter thread
446
- *
447
- * This helps a combined critical section feel more normal in the case where
448
- * the user wants to return a value, for example
449
- *
450
- * auto value = mutex_.lock_combine([&]() {
451
- * return data_.value();
452
- * });
453
- *
454
- * Without this, the user would typically create a dummy object that they
455
- * would then assign to from within the lambda. With return value chaining,
456
- * this pattern feels more natural
457
- *
458
- * Note that it is important to copy the entire callble into this class.
459
- * Storing something like a reference instead is not desirable because it does
460
- * not allow InlineFunctionRef to use inline storage to represent the user's
461
- * callable without extra indirections
462
- *
463
- * We use std::conditional_t and switch to the right type of task with the
464
- * CoalescedTask type alias
465
- */
466
- template <typename Func, typename Waiter>
467
- class TaskWithCoalesce {
468
- public:
469
- using ReturnType = decltype(std::declval<const Func&>()());
470
- using StorageType = folly::Unit;
471
- explicit TaskWithCoalesce(Func func, Waiter& waiter)
472
- : func_{std::move(func)}, waiter_(waiter) {}
473
-
474
- void operator()() const {
475
- auto value = func_();
476
- new (&waiter_.storage_) ReturnType(std::move(value));
477
- }
478
-
479
- private:
480
- Func func_;
481
- Waiter& waiter_;
482
-
483
- static_assert(!std::is_void<ReturnType>{}, "");
484
- static_assert(alignof(decltype(waiter_.storage_)) >= alignof(ReturnType), "");
485
- static_assert(sizeof(decltype(waiter_.storage_)) >= sizeof(ReturnType), "");
486
- };
487
-
488
- template <typename Func, typename Waiter>
489
- class TaskWithoutCoalesce {
490
- public:
491
- using ReturnType = void;
492
- using StorageType = folly::Unit;
493
- explicit TaskWithoutCoalesce(Func func, Waiter&) : func_{std::move(func)} {}
494
-
495
- void operator()() const {
496
- func_();
497
- }
498
-
499
- private:
500
- Func func_;
501
- };
502
-
503
- template <typename Func, typename Waiter>
504
- class TaskWithBigReturnValue {
505
- public:
506
- // Using storage that is aligned on the cacheline boundary helps us avoid a
507
- // situation where the data ends up being allocated on two separate
508
- // cachelines. This would require the remote thread to pull in both lines
509
- // to issue a write.
510
- //
511
- // We also isolate the storage by appending some padding to the end to
512
- // ensure we avoid false-sharing with the metadata used while the waiter
513
- // waits
514
- using ReturnType = decltype(std::declval<const Func&>()());
515
- static const auto kReturnValueAlignment = folly::kIsMsvc
516
- ? 8
517
- : folly::constexpr_max(
518
- alignof(ReturnType),
519
- folly::hardware_destructive_interference_size);
520
- using StorageType = _t<std::aligned_storage<
521
- sizeof(
522
- _t<std::aligned_storage<sizeof(ReturnType), kReturnValueAlignment>>),
523
- kReturnValueAlignment>>;
524
-
525
- explicit TaskWithBigReturnValue(Func func, Waiter&)
526
- : func_{std::move(func)} {}
527
-
528
- void operator()() const {
529
- assert(storage_);
530
- auto value = func_();
531
- new (storage_) ReturnType(std::move(value));
532
- }
533
-
534
- void attach(StorageType* storage) {
535
- assert(!storage_);
536
- storage_ = storage;
537
- }
538
-
539
- private:
540
- Func func_;
541
- StorageType* storage_{nullptr};
542
-
543
- static_assert(!std::is_void<ReturnType>{}, "");
544
- static_assert(sizeof(Waiter::storage_) < sizeof(ReturnType), "");
545
- };
546
-
547
- template <typename T, bool>
548
- struct Sizeof_;
549
- template <typename T>
550
- struct Sizeof_<T, false> : std::integral_constant<std::size_t, sizeof(T)> {};
551
- template <typename T>
552
- struct Sizeof_<T, true> : std::integral_constant<std::size_t, 0> {};
553
- template <typename T>
554
- struct Sizeof : Sizeof_<T, std::is_void<T>::value> {};
555
-
556
- // we need to use std::integral_constant::value here as opposed to
557
- // std::integral_constant::operator T() because MSVC errors out with the
558
- // implicit conversion
559
- template <typename Func, typename Waiter>
560
- using CoalescedTask = _t<std::conditional<
561
- std::is_void<decltype(std::declval<const Func&>()())>::value,
562
- TaskWithoutCoalesce<Func, Waiter>,
563
- _t<std::conditional<
564
- Sizeof<decltype(std::declval<const Func&>()())>::value <=
565
- sizeof(Waiter::storage_),
566
- TaskWithCoalesce<Func, Waiter>,
567
- TaskWithBigReturnValue<Func, Waiter>>>>>;
568
-
569
- /**
570
- * Given a request and a wait node, coalesce them into a CoalescedTask that
571
- * coalesces the return value into the wait node when invoked from a remote
572
- * thread
573
- *
574
- * When given a null request through nullptr_t, coalesce() returns null as well
575
- */
576
- template <typename Waiter>
577
- std::nullptr_t coalesce(std::nullptr_t&, Waiter&) {
578
- return nullptr;
579
- }
580
-
581
- template <
582
- typename Request,
583
- typename Waiter,
584
- typename Func = typename Request::F>
585
- CoalescedTask<Func, Waiter> coalesce(Request& request, Waiter& waiter) {
586
- static_assert(!std::is_same<Request, std::nullptr_t>{}, "");
587
- return CoalescedTask<Func, Waiter>{request.func_, waiter};
588
- }
589
-
590
- /**
591
- * Given a task, create storage for the return value. When we get a type
592
- * of CoalescedTask, this returns an instance of CoalescedTask::StorageType.
593
- * std::nullptr_t otherwise
594
- */
595
- inline std::nullptr_t makeReturnValueStorageFor(std::nullptr_t&) {
596
- return {};
597
- }
598
-
599
- template <
600
- typename CoalescedTask,
601
- typename StorageType = typename CoalescedTask::StorageType>
602
- StorageType makeReturnValueStorageFor(CoalescedTask&) {
603
- return {};
604
- }
605
-
606
- /**
607
- * Given a task and storage, attach them together if needed. This only helps
608
- * when we have a task that returns a value bigger than can be coalesced. In
609
- * that case, we need to attach the storage with the task so the return value
610
- * can be transferred to this thread from the remote thread
611
- */
612
- template <typename Task, typename Storage>
613
- void attach(Task&, Storage&) {
614
- static_assert(
615
- std::is_same<Storage, std::nullptr_t>{} ||
616
- std::is_same<Storage, folly::Unit>{},
617
- "");
618
- }
619
-
620
- template <
621
- typename R,
622
- typename W,
623
- typename StorageType = typename TaskWithBigReturnValue<R, W>::StorageType>
624
- void attach(TaskWithBigReturnValue<R, W>& task, StorageType& storage) {
625
- task.attach(&storage);
626
- }
627
-
628
- template <typename Request, typename Waiter>
629
- void throwIfExceptionOccurred(Request&, Waiter& waiter, bool exception) {
630
- using Storage = decltype(waiter.storage_);
631
- using F = typename Request::F;
632
- static_assert(sizeof(Storage) >= sizeof(std::exception_ptr), "");
633
- static_assert(alignof(Storage) >= alignof(std::exception_ptr), "");
634
-
635
- // we only need to check for an exception in the waiter struct if the passed
636
- // callable is not noexcept
637
- //
638
- // we need to make another instance of the exception with automatic storage
639
- // duration and destroy the exception held in the storage *before throwing* to
640
- // avoid leaks. If we don't destroy the exception_ptr in storage, the
641
- // refcount for the internal exception will never hit zero, thereby leaking
642
- // memory
643
- if ((!noexcept(std::declval<const F&>()()) && exception)) {
644
- auto storage = &waiter.storage_;
645
- auto exc = folly::launder(reinterpret_cast<std::exception_ptr*>(storage));
646
- auto copy = std::move(*exc);
647
- exc->std::exception_ptr::~exception_ptr();
648
- std::rethrow_exception(std::move(copy));
649
- }
650
- }
651
-
652
- /**
653
- * Given a CoalescedTask, a wait node and a request. Detach the return value
654
- * into the request from the wait node and task.
655
- */
656
- template <typename Waiter>
657
- void detach(std::nullptr_t&, Waiter&, bool exception, std::nullptr_t&) {
658
- assert(!exception);
659
- }
660
-
661
- template <typename Waiter, typename F>
662
- void detach(
663
- RequestWithoutReturn<F>& request,
664
- Waiter& waiter,
665
- bool exception,
666
- folly::Unit&) {
667
- throwIfExceptionOccurred(request, waiter, exception);
668
- }
669
-
670
- template <typename Waiter, typename F>
671
- void detach(
672
- RequestWithReturn<F>& request,
673
- Waiter& waiter,
674
- bool exception,
675
- folly::Unit&) {
676
- throwIfExceptionOccurred(request, waiter, exception);
677
-
678
- using ReturnType = typename RequestWithReturn<F>::ReturnType;
679
- static_assert(!std::is_same<ReturnType, void>{}, "");
680
- static_assert(sizeof(waiter.storage_) >= sizeof(ReturnType), "");
681
-
682
- auto& val = *folly::launder(reinterpret_cast<ReturnType*>(&waiter.storage_));
683
- new (&request.value_) ReturnType(std::move(val));
684
- val.~ReturnType();
685
- }
686
-
687
- template <typename Waiter, typename F, typename Storage>
688
- void detach(
689
- RequestWithReturn<F>& request,
690
- Waiter& waiter,
691
- bool exception,
692
- Storage& storage) {
693
- throwIfExceptionOccurred(request, waiter, exception);
694
-
695
- using ReturnType = typename RequestWithReturn<F>::ReturnType;
696
- static_assert(!std::is_same<ReturnType, void>{}, "");
697
- static_assert(sizeof(storage) >= sizeof(ReturnType), "");
698
-
699
- auto& val = *folly::launder(reinterpret_cast<ReturnType*>(&storage));
700
- new (&request.value_) ReturnType(std::move(val));
701
- val.~ReturnType();
702
- }
703
-
704
- /**
705
- * Get the time since epoch in nanoseconds
706
- *
707
- * This is faster than std::chrono::steady_clock because it avoids a VDSO
708
- * access to get the timestamp counter
709
- *
710
- * Note that the hardware timestamp counter on x86, like std::steady_clock is
711
- * guaranteed to be monotonically increasing -
712
- * https://c9x.me/x86/html/file_module_x86_id_278.html
713
- */
714
- inline std::chrono::nanoseconds time() {
715
- return std::chrono::nanoseconds{hardware_timestamp()};
716
- }
717
-
718
- /**
719
- * Zero out the other bits used by the implementation and return just an
720
- * address from a uintptr_t
721
- */
722
- template <typename Type>
723
- Type* extractPtr(std::uintptr_t from) {
724
- // shift one bit off the end, to get all 1s followed by a single 0
725
- auto mask = std::numeric_limits<std::uintptr_t>::max();
726
- mask >>= 1;
727
- mask <<= 1;
728
- assert(!(mask & 0b1));
729
-
730
- return folly::bit_cast<Type*>(from & mask);
731
- }
732
-
733
- /**
734
- * Strips the given nanoseconds into only the least significant 56 bits by
735
- * moving the least significant 56 bits over by 8 zeroing out the bottom 8
736
- * bits to be used as a medium of information transfer for the thread wait
737
- * nodes
738
- */
739
- inline std::uint64_t strip(std::chrono::nanoseconds t) {
740
- auto time = t.count();
741
- return static_cast<std::uint64_t>(time) << 8;
742
- }
743
-
744
- /**
745
- * Recover the timestamp value from an integer that has the timestamp encoded
746
- * in it
747
- */
748
- inline std::uint64_t recover(std::uint64_t from) {
749
- return from >> 8;
750
- }
751
-
752
- template <template <typename> class Atomic, bool TimePublishing>
753
- class DistributedMutex<Atomic, TimePublishing>::DistributedMutexStateProxy {
754
- public:
755
- // DistributedMutexStateProxy is move constructible and assignable for
756
- // convenience
757
- DistributedMutexStateProxy(DistributedMutexStateProxy&& other) {
758
- *this = std::move(other);
759
- }
760
-
761
- DistributedMutexStateProxy& operator=(DistributedMutexStateProxy&& other) {
762
- assert(!(*this));
763
-
764
- next_ = folly::exchange(other.next_, nullptr);
765
- expected_ = folly::exchange(other.expected_, 0);
766
- timedWaiters_ = folly::exchange(other.timedWaiters_, false);
767
- combined_ = folly::exchange(other.combined_, false);
768
- waker_ = folly::exchange(other.waker_, 0);
769
- waiters_ = folly::exchange(other.waiters_, nullptr);
770
- ready_ = folly::exchange(other.ready_, nullptr);
771
-
772
- return *this;
773
- }
774
-
775
- // The proxy is valid when a mutex acquisition attempt was successful,
776
- // lock() is guaranteed to return a valid proxy, try_lock() is not
777
- explicit operator bool() const {
778
- return expected_;
779
- }
780
-
781
- // private:
782
- // friend the mutex class, since that will be accessing state private to
783
- // this class
784
- friend class DistributedMutex<Atomic, TimePublishing>;
785
-
786
- DistributedMutexStateProxy(
787
- Waiter<Atomic>* next,
788
- std::uintptr_t expected,
789
- bool timedWaiter = false,
790
- bool combined = false,
791
- std::uintptr_t waker = 0,
792
- Waiter<Atomic>* waiters = nullptr,
793
- Waiter<Atomic>* ready = nullptr)
794
- : next_{next},
795
- expected_{expected},
796
- timedWaiters_{timedWaiter},
797
- combined_{combined},
798
- waker_{waker},
799
- waiters_{waiters},
800
- ready_{ready} {}
801
-
802
- // the next thread that is to be woken up, this being null at the time of
803
- // unlock() shows that the current thread acquired the mutex without
804
- // contention or it was the terminal thread in the queue of threads waking up
805
- Waiter<Atomic>* next_{nullptr};
806
- // this is the value that the current thread should expect to find on
807
- // unlock, and if this value is not there on unlock, the current thread
808
- // should assume that other threads are enqueued waiting for the mutex
809
- //
810
- // note that if the mutex has the same state set at unlock time, and this is
811
- // set to an address (and not say kLocked in the case of a terminal waker)
812
- // then it must have been the case that no other thread had enqueued itself,
813
- // since threads in the domain of this mutex do not share stack space
814
- //
815
- // if we want to support stack sharing, we can solve the problem by looping
816
- // at lock time, and setting a variable that says whether we have acquired
817
- // the lock or not perhaps
818
- std::uintptr_t expected_{0};
819
- // a boolean that will be set when the mutex has timed waiters that the
820
- // current thread is responsible for waking, in such a case, the current
821
- // thread will issue an atomic_notify_one() call after unlocking the mutex
822
- //
823
- // note that a timed waiter will itself always have this flag set. This is
824
- // done so we can avoid having to issue a atomic_notify_all() call (and
825
- // subsequently a thundering herd) when waking up timed-wait threads
826
- bool timedWaiters_{false};
827
- // a boolean that contains true if the state proxy is not meant to be passed
828
- // to the unlock() function. This is set only when there is contention and
829
- // a thread had asked for its critical section to be combined
830
- bool combined_{false};
831
- // metadata passed along from the thread that woke this thread up
832
- std::uintptr_t waker_{0};
833
- // the list of threads that are waiting on a futex
834
- //
835
- // the current threads is meant to wake up this list of waiters if it is
836
- // able to commit an unlock() on the mutex without seeing a contention chain
837
- Waiter<Atomic>* waiters_{nullptr};
838
- // after a thread has woken up from a futex() call, it will have the rest of
839
- // the threads that it were waiting behind it in this list, a thread that
840
- // unlocks has to wake up threads from this list if it has any, before it
841
- // goes to sleep to prevent pathological unfairness
842
- Waiter<Atomic>* ready_{nullptr};
843
- };
844
-
845
- template <template <typename> class Atomic, bool TimePublishing>
846
- DistributedMutex<Atomic, TimePublishing>::DistributedMutex()
847
- : state_{kUnlocked} {}
848
-
849
- template <typename Waiter>
850
- std::uint64_t publish(
851
- std::uint64_t spins,
852
- bool& shouldPublish,
853
- std::chrono::nanoseconds& previous,
854
- Waiter& waiter,
855
- std::uint32_t waitMode) {
856
- // time publishing has some overhead because it executes an atomic exchange on
857
- // the futex word. If this line is in a remote thread (eg. the combiner),
858
- // then each time we publish a timestamp, this thread has to submit an RFO to
859
- // the remote core for the cacheline, blocking progress for both threads.
860
- //
861
- // the remote core uses a store in the fast path - why then does an RFO make a
862
- // difference? The only educated guess we have here is that the added
863
- // roundtrip delays draining of the store buffer, which essentially exerts
864
- // backpressure on future stores, preventing parallelization
865
- //
866
- // if we have requested a combine, time publishing is less important as it
867
- // only comes into play when the combiner has exhausted their max combine
868
- // passes. So we defer time publishing to the point when the current thread
869
- // gets preempted
870
- auto current = time();
871
- if ((current - previous) >= kScheduledAwaySpinThreshold) {
872
- shouldPublish = true;
873
- }
874
- previous = current;
875
-
876
- // if we have requested a combine, and this is the first iteration of the
877
- // wait-loop, we publish a max timestamp to optimistically convey that we have
878
- // not yet been preempted (the remote knows the meaning of max timestamps)
879
- //
880
- // then if we are under the maximum number of spins allowed before sleeping,
881
- // we publish the exact timestamp, otherwise we publish the minimum possible
882
- // timestamp to force the waking thread to skip us
883
- auto now = ((waitMode == kCombineWaiting) && !spins)
884
- ? decltype(time())::max()
885
- : (spins < kMaxSpins) ? previous : decltype(time())::zero();
886
-
887
- // the wait mode information is published in the bottom 8 bits of the futex
888
- // word, the rest contains time information as computed above. Overflows are
889
- // not really a correctness concern because time publishing is only a
890
- // heuristic. This leaves us 56 bits of nanoseconds (2 years) before we hit
891
- // two consecutive wraparounds, so the lack of bits to respresent time is
892
- // neither a performance nor correctness concern
893
- auto data = strip(now) | waitMode;
894
- auto signal = (shouldPublish || !spins || (waitMode != kCombineWaiting))
895
- ? waiter.futex_.exchange(data, std::memory_order_acq_rel)
896
- : waiter.futex_.load(std::memory_order_acquire);
897
- return signal & std::numeric_limits<std::uint8_t>::max();
898
- }
899
-
900
- template <typename Waiter>
901
- bool spin(Waiter& waiter, std::uint32_t& sig, std::uint32_t mode) {
902
- auto spins = std::uint64_t{0};
903
- auto waitMode = (mode == kCombineUninitialized) ? kCombineWaiting : kWaiting;
904
- auto previous = time();
905
- auto shouldPublish = false;
906
- while (true) {
907
- auto signal = publish(spins++, shouldPublish, previous, waiter, waitMode);
908
-
909
- // if we got skipped, make a note of it and return if we got a skipped
910
- // signal or a signal to wake up
911
- auto skipped = (signal == kSkipped);
912
- auto combined = (signal == kCombined);
913
- auto exceptionOccurred = (signal == kExceptionOccurred);
914
- auto woken = (signal == kWake);
915
- if (skipped || woken || combined || exceptionOccurred) {
916
- sig = static_cast<std::uint32_t>(signal);
917
- return !skipped;
918
- }
919
-
920
- // if we are under the spin threshold, pause to allow the other
921
- // hyperthread to run. If not, then sleep
922
- if (spins < kMaxSpins) {
923
- asm_volatile_pause();
924
- } else {
925
- Sleeper::sleep();
926
- }
927
- }
928
- }
929
-
930
- template <typename Waiter>
931
- void doFutexWake(Waiter* waiter) {
932
- if (waiter) {
933
- // We can use a simple store operation here and not worry about checking
934
- // to see if the thread had actually started waiting on the futex, that is
935
- // already done in tryWake() when a sleeping thread is collected
936
- //
937
- // We now do not know whether the waiter had already enqueued on the futex
938
- // or whether it had just stored kSleeping in its futex and was about to
939
- // call futexWait(). We treat both these scenarios the same
940
- //
941
- // the below can theoretically cause a problem if we set the
942
- // wake signal and the waiter was in between setting kSleeping in its
943
- // futex and enqueueing on the futex. In this case the waiter will just
944
- // return from futexWait() immediately. This leaves the address that the
945
- // waiter was using for futexWait() possibly dangling, and the thread that
946
- // we woke in the exchange above might have used that address for some
947
- // other object
948
- //
949
- // however, even if the thread had indeed woken up simply becasue of the
950
- // above exchange(), the futexWake() below is not incorrect. It is not
951
- // incorrect because futexWake() does not actually change the memory of
952
- // the futex word. It just uses the address to do a lookup in the kernel
953
- // futex table. And even if we call futexWake() on some other address,
954
- // and that address was being used to wait on futex() that thread will
955
- // protect itself from spurious wakeups, check the value in the futex word
956
- // and enqueue itself back on the futex
957
- //
958
- // this dangilng pointer possibility is why we use a pointer to the futex
959
- // word, and avoid dereferencing after the store() operation
960
- auto sleeper = &waiter->metadata_.sleeper_;
961
- sleeper->store(kWake, std::memory_order_release);
962
- futexWake(sleeper, 1);
963
- }
964
- }
965
-
966
- template <typename Waiter>
967
- bool doFutexWait(Waiter* waiter, Waiter*& next) {
968
- // first we get ready to sleep by calling exchange() on the futex with a
969
- // kSleeping value
970
- assert(waiter->futex_.load(std::memory_order_relaxed) == kAboutToWait);
971
-
972
- // note the semantics of using a futex here, when we exchange the sleeper_
973
- // with kSleeping, we are getting ready to sleep, but before sleeping we get
974
- // ready to sleep, and we return from futexWait() when the value of
975
- // sleeper_ might have changed. We can also wake up because of a spurious
976
- // wakeup, so we always check against the value in sleeper_ after returning
977
- // from futexWait(), if the value is not kWake, then we continue
978
- auto pre =
979
- waiter->metadata_.sleeper_.exchange(kSleeping, std::memory_order_acq_rel);
980
-
981
- // Seeing a kSleeping on a futex word before we set it ourselves means only
982
- // one thing - an unlocking thread caught us before we went to futex(), and
983
- // we now have the lock, so we abort
984
- //
985
- // if we were given an early delivery, we can return from this function with
986
- // a true, meaning that we now have the lock
987
- if (pre == kSleeping) {
988
- return true;
989
- }
990
-
991
- // if we reach here then were were not given an early delivery, and any
992
- // thread that goes to wake us up will see a consistent view of the rest of
993
- // the contention chain (since the next_ variable is set before the
994
- // kSleeping exchange above)
995
- while (pre != kWake) {
996
- // before enqueueing on the futex, we wake any waiters that we were
997
- // possibly responsible for
998
- doFutexWake(folly::exchange(next, nullptr));
999
-
1000
- // then we wait on the futex
1001
- //
1002
- // note that we have to protect ourselves against spurious wakeups here.
1003
- // Because the corresponding futexWake() above does not synchronize
1004
- // wakeups around the futex word. Because doing so would become
1005
- // inefficient
1006
- futexWait(&waiter->metadata_.sleeper_, kSleeping);
1007
- pre = waiter->metadata_.sleeper_.load(std::memory_order_acquire);
1008
- assert((pre == kSleeping) || (pre == kWake));
1009
- }
1010
-
1011
- // when coming out of a futex, we might have some other sleeping threads
1012
- // that we were supposed to wake up, assign that to the next pointer
1013
- assert(next == nullptr);
1014
- next = extractPtr<Waiter>(waiter->next_.load(std::memory_order_relaxed));
1015
- return false;
1016
- }
1017
-
1018
- template <typename Waiter>
1019
- bool wait(Waiter* waiter, std::uint32_t mode, Waiter*& next, uint32_t& signal) {
1020
- if (mode == kAboutToWait) {
1021
- return doFutexWait(waiter, next);
1022
- }
1023
-
1024
- return spin(*waiter, signal, mode);
1025
- }
1026
-
1027
- inline void recordTimedWaiterAndClearTimedBit(
1028
- bool& timedWaiter,
1029
- std::uintptr_t& previous) {
1030
- // the previous value in the mutex can never be kTimedWaiter, timed waiters
1031
- // always set (kTimedWaiter | kLocked) in the mutex word when they try and
1032
- // acquire the mutex
1033
- assert(previous != kTimedWaiter);
1034
-
1035
- if ((previous & kTimedWaiter)) {
1036
- // record whether there was a timed waiter in the previous mutex state, and
1037
- // clear the timed bit from the previous state
1038
- timedWaiter = true;
1039
- previous = previous & (~kTimedWaiter);
1040
- }
1041
- }
1042
-
1043
- template <typename Atomic>
1044
- void wakeTimedWaiters(Atomic* state, bool timedWaiters) {
1045
- if ((timedWaiters)) {
1046
- atomic_notify_one(state);
1047
- }
1048
- }
1049
-
1050
- template <template <typename> class Atomic, bool TimePublishing>
1051
- template <typename Func>
1052
- auto DistributedMutex<Atomic, TimePublishing>::lock_combine(Func func)
1053
- -> decltype(std::declval<const Func&>()()) {
1054
- // invoke the lock implementation function and check whether we came out of
1055
- // it with our task executed as a combined critical section. This usually
1056
- // happens when the mutex is contended.
1057
- //
1058
- // In the absence of contention, we just return from the try_lock() function
1059
- // with the lock acquired. So we need to invoke the task and unlock
1060
- // the mutex before returning
1061
- auto&& task = Request<Func>{func};
1062
- auto&& state = lockImplementation(*this, state_, task);
1063
- if (!state.combined_) {
1064
- // to avoid having to play a return-value dance when the combinable
1065
- // returns void, we use a scope exit to perform the unlock after the
1066
- // function return has been processed
1067
- SCOPE_EXIT {
1068
- unlock(std::move(state));
1069
- };
1070
- return func();
1071
- }
1072
-
1073
- // if we are here, that means we were able to get our request combined, we
1074
- // can return the value that was transferred to us
1075
- //
1076
- // each thread that enqueues as a part of a contention chain takes up the
1077
- // responsibility of any timed waiter that had come immediately before it,
1078
- // so we wake up timed waiters before exiting the lock function. Another
1079
- // strategy might be to add the timed waiter information to the metadata and
1080
- // let a single leader wake up a timed waiter for better concurrency. But
1081
- // this has proven not to be useful in benchmarks beyond a small 5% delta,
1082
- // so we avoid taking the complexity hit and branch to wake up timed waiters
1083
- // from each thread
1084
- wakeTimedWaiters(&state_, state.timedWaiters_);
1085
- return std::move(task).get();
1086
- }
1087
-
1088
- template <template <typename> class Atomic, bool TimePublishing>
1089
- typename DistributedMutex<Atomic, TimePublishing>::DistributedMutexStateProxy
1090
- DistributedMutex<Atomic, TimePublishing>::lock() {
1091
- auto null = nullptr;
1092
- return lockImplementation(*this, state_, null);
1093
- }
1094
-
1095
- template <typename Atomic, template <typename> class A, bool T>
1096
- auto tryLockNoLoad(Atomic& atomic, DistributedMutex<A, T>&)
1097
- -> typename DistributedMutex<A, T>::DistributedMutexStateProxy {
1098
- // Try and set the least significant bit of the centralized lock state to 1,
1099
- // if this succeeds, it must have been the case that we had a kUnlocked (or
1100
- // 0) in the central storage before, since that is the only case where a 0
1101
- // can be found in the least significant bit
1102
- //
1103
- // If this fails, then it is a no-op
1104
- using Proxy = typename DistributedMutex<A, T>::DistributedMutexStateProxy;
1105
- auto previous = atomic_fetch_set(atomic, 0, std::memory_order_acquire);
1106
- if (!previous) {
1107
- return Proxy{nullptr, kLocked};
1108
- }
1109
-
1110
- return Proxy{nullptr, 0};
1111
- }
1112
-
1113
- template <template <typename> class Atomic, bool TimePublishing>
1114
- typename DistributedMutex<Atomic, TimePublishing>::DistributedMutexStateProxy
1115
- DistributedMutex<Atomic, TimePublishing>::try_lock() {
1116
- // The lock attempt below requires an expensive atomic fetch-and-mutate or
1117
- // an even more expensive atomic compare-and-swap loop depending on the
1118
- // platform. These operations require pulling the lock cacheline into the
1119
- // current core in exclusive mode and are therefore hard to parallelize
1120
- //
1121
- // This probabilistically avoids the expense by first checking whether the
1122
- // mutex is currently locked
1123
- if (state_.load(std::memory_order_relaxed) != kUnlocked) {
1124
- return DistributedMutexStateProxy{nullptr, 0};
1125
- }
1126
-
1127
- return tryLockNoLoad(state_, *this);
1128
- }
1129
-
1130
- template <
1131
- template <typename> class Atomic,
1132
- bool TimePublishing,
1133
- typename State,
1134
- typename Request>
1135
- typename DistributedMutex<Atomic, TimePublishing>::DistributedMutexStateProxy
1136
- lockImplementation(
1137
- DistributedMutex<Atomic, TimePublishing>& mutex,
1138
- State& atomic,
1139
- Request& request) {
1140
- // first try and acquire the lock as a fast path, the underlying
1141
- // implementation is slightly faster than using std::atomic::exchange() as
1142
- // is used in this function. So we get a small perf boost in the
1143
- // uncontended case
1144
- //
1145
- // We only go through this fast path for the lock/unlock usage and avoid this
1146
- // for combined critical sections. This check adds unnecessary overhead in
1147
- // that case as it causes an extra cacheline bounce
1148
- constexpr auto combineRequested = !std::is_same<Request, std::nullptr_t>{};
1149
- if (!combineRequested) {
1150
- if (auto state = tryLockNoLoad(atomic, mutex)) {
1151
- return state;
1152
- }
1153
- }
1154
-
1155
- auto previous = std::uintptr_t{0};
1156
- auto waitMode = combineRequested ? kCombineUninitialized : kUninitialized;
1157
- auto nextWaitMode = kAboutToWait;
1158
- auto timedWaiter = false;
1159
- Waiter<Atomic>* nextSleeper = nullptr;
1160
- while (true) {
1161
- // construct the state needed to wait
1162
- //
1163
- // We can't use auto here because MSVC errors out due to a missing copy
1164
- // constructor
1165
- Waiter<Atomic> state{};
1166
- auto&& task = coalesce(request, state);
1167
- auto&& storage = makeReturnValueStorageFor(task);
1168
- auto&& address = folly::bit_cast<std::uintptr_t>(&state);
1169
- attach(task, storage);
1170
- state.initialize(waitMode, std::move(task));
1171
- assert(!(address & 0b1));
1172
-
1173
- // set the locked bit in the address we will be persisting in the mutex
1174
- address |= kLocked;
1175
-
1176
- // attempt to acquire the mutex, mutex acquisition is successful if the
1177
- // previous value is zeroed out
1178
- //
1179
- // we use memory_order_acq_rel here because we want the read-modify-write
1180
- // operation to be both acquire and release. Acquire becasue if this is a
1181
- // successful lock acquisition, we want to acquire state any other thread
1182
- // has released from a prior unlock. We want release semantics becasue
1183
- // other threads that read the address of this value should see the full
1184
- // well-initialized node we are going to wait on if the mutex acquisition
1185
- // was unsuccessful
1186
- previous = atomic.exchange(address, std::memory_order_acq_rel);
1187
- recordTimedWaiterAndClearTimedBit(timedWaiter, previous);
1188
- state.next_.store(previous, std::memory_order_relaxed);
1189
- if (previous == kUnlocked) {
1190
- return {/* next */ nullptr,
1191
- /* expected */ address,
1192
- /* timedWaiter */ timedWaiter,
1193
- /* combined */ false,
1194
- /* waker */ 0,
1195
- /* waiters */ nullptr,
1196
- /* ready */ nextSleeper};
1197
- }
1198
- assert(previous & kLocked);
1199
-
1200
- // wait until we get a signal from another thread, if this returns false,
1201
- // we got skipped and had probably been scheduled out, so try again
1202
- auto signal = kUninitialized;
1203
- if (!wait(&state, waitMode, nextSleeper, signal)) {
1204
- std::swap(waitMode, nextWaitMode);
1205
- continue;
1206
- }
1207
-
1208
- // at this point it is safe to access the other fields in the waiter state,
1209
- // since the thread that woke us up is gone and nobody will be touching this
1210
- // state again, note that this requires memory ordering, and this is why we
1211
- // use memory_order_acquire (among other reasons) in the above wait
1212
- //
1213
- // first we see if the value we took off the mutex state was the thread that
1214
- // initated the wakeups, if so, we are the terminal node of the current
1215
- // contention chain. If we are the terminal node, then we should expect to
1216
- // see a kLocked in the mutex state when we unlock, if we see that, we can
1217
- // commit the unlock to the centralized mutex state. If not, we need to
1218
- // continue wakeups
1219
- //
1220
- // a nice consequence of passing kLocked as the current address if we are
1221
- // the terminal node is that it naturally just works with the algorithm. If
1222
- // we get a contention chain when coming out of a contention chain, the tail
1223
- // of the new contention chain will have kLocked set as the previous, which,
1224
- // as it happens "just works", since we have now established a recursive
1225
- // relationship until broken
1226
- auto next = previous;
1227
- auto expected = address;
1228
- if (previous == state.metadata_.waker_) {
1229
- next = 0;
1230
- expected = kLocked;
1231
- }
1232
-
1233
- // if we were given a combine signal, detach the return value from the
1234
- // wait struct into the request, so the current thread can access it
1235
- // outside this function
1236
- auto combined = (signal == kCombined);
1237
- auto exceptionOccurred = (signal == kExceptionOccurred);
1238
- if (combined || exceptionOccurred) {
1239
- detach(request, state, exceptionOccurred, storage);
1240
- }
1241
-
1242
- // if we are just coming out of a futex call, then it means that the next
1243
- // waiter we are responsible for is also a waiter waiting on a futex, so
1244
- // we return that list in the list of ready threads. We wlil be waking up
1245
- // the ready threads on unlock no matter what
1246
- return {/* next */ extractPtr<Waiter<Atomic>>(next),
1247
- /* expected */ expected,
1248
- /* timedWaiter */ timedWaiter,
1249
- /* combined */ combineRequested && (combined || exceptionOccurred),
1250
- /* waker */ state.metadata_.waker_,
1251
- /* waiters */ extractPtr<Waiter<Atomic>>(state.metadata_.waiters_),
1252
- /* ready */ nextSleeper};
1253
- }
1254
- }
1255
-
1256
- inline bool preempted(std::uint64_t value, std::chrono::nanoseconds now) {
1257
- auto currentTime = recover(strip(now));
1258
- auto nodeTime = recover(value);
1259
- auto preempted =
1260
- (currentTime > nodeTime + kScheduledAwaySpinThreshold.count()) &&
1261
- (nodeTime != recover(strip(std::chrono::nanoseconds::max())));
1262
-
1263
- // we say that the thread has been preempted if its timestamp says so, and
1264
- // also if it is neither uninitialized nor skipped
1265
- assert(value != kSkipped);
1266
- return (preempted) && (value != kUninitialized) &&
1267
- (value != kCombineUninitialized);
1268
- }
1269
-
1270
- inline bool isSleeper(std::uintptr_t value) {
1271
- return (value == kAboutToWait);
1272
- }
1273
-
1274
- inline bool isInitialized(std::uintptr_t value) {
1275
- return (value != kUninitialized) && (value != kCombineUninitialized);
1276
- }
1277
-
1278
- inline bool isCombiner(std::uintptr_t value) {
1279
- auto mode = (value & 0xff);
1280
- return (mode == kCombineWaiting) || (mode == kCombineUninitialized);
1281
- }
1282
-
1283
- inline bool isWaitingCombiner(std::uintptr_t value) {
1284
- return (value & 0xff) == kCombineWaiting;
1285
- }
1286
-
1287
- template <typename Waiter>
1288
- CombineFunction loadTask(Waiter* current, std::uintptr_t value) {
1289
- // if we know that the waiter is a combiner of some sort, it is safe to read
1290
- // and copy the value of the function in the waiter struct, since we know
1291
- // that a waiter would have set it before enqueueing
1292
- if (isCombiner(value)) {
1293
- return current->function_;
1294
- }
1295
-
1296
- return nullptr;
1297
- }
1298
-
1299
- template <typename Waiter>
1300
- void transferCurrentException(Waiter* waiter) {
1301
- assert(std::current_exception());
1302
- new (&waiter->storage_) std::exception_ptr(std::current_exception());
1303
- waiter->futex_.store(kExceptionOccurred, std::memory_order_release);
1304
- }
1305
-
1306
- template <template <typename> class Atomic>
1307
- inline std::uintptr_t tryCombine(
1308
- Waiter<Atomic>* waiter,
1309
- std::uintptr_t value,
1310
- std::uintptr_t next,
1311
- std::uint64_t iteration,
1312
- std::chrono::nanoseconds now,
1313
- CombineFunction task) {
1314
- #ifndef ROCKSDB_LITE
1315
- // if the waiter has asked for a combine operation, we should combine its
1316
- // critical section and move on to the next waiter
1317
- //
1318
- // the waiter is combinable if the following conditions are satisfied
1319
- //
1320
- // 1) the state in the futex word is not uninitialized (kUninitialized)
1321
- // 2) it has a valid combine function
1322
- // 3) we are not past the limit of the number of combines we can perform
1323
- // or the waiter thread been preempted. If the waiter gets preempted,
1324
- // its better to just execute their critical section before moving on.
1325
- // As they will have to re-queue themselves after preemption anyway,
1326
- // leading to further delays in critical section completion
1327
- //
1328
- // if all the above are satisfied, then we can combine the critical section.
1329
- // Note that if the waiter is in a combineable state, that means that it had
1330
- // finished its writes to both the task and the next_ value. And observing
1331
- // a waiting state also means that we have acquired the writes to the other
1332
- // members of the waiter struct, so it's fine to use those values here
1333
- if (isWaitingCombiner(value) &&
1334
- (iteration <= kMaxCombineIterations || preempted(value, now))) {
1335
- try {
1336
- task();
1337
- waiter->futex_.store(kCombined, std::memory_order_release);
1338
- } catch (...) {
1339
- transferCurrentException(waiter);
1340
- }
1341
- return next;
1342
- }
1343
- #endif // ROCKSDB_LITE
1344
- return 0;
1345
- }
1346
-
1347
- template <typename Waiter>
1348
- inline std::uintptr_t tryWake(
1349
- bool publishing,
1350
- Waiter* waiter,
1351
- std::uintptr_t value,
1352
- std::uintptr_t next,
1353
- std::uintptr_t waker,
1354
- Waiter*& sleepers,
1355
- std::uint64_t iteration,
1356
- CombineFunction task) {
1357
- // try and combine the waiter's request first, if that succeeds that means
1358
- // we have successfully executed their critical section and can move on to
1359
- // the rest of the chain
1360
- auto now = time();
1361
- if (tryCombine(waiter, value, next, iteration, now, task)) {
1362
- return next;
1363
- }
1364
-
1365
- // first we see if we can wake the current thread that is spinning
1366
- if ((!publishing || !preempted(value, now)) && !isSleeper(value)) {
1367
- // the Metadata class should be trivially destructible as we use placement
1368
- // new to set the relevant metadata without calling any destructor. We
1369
- // need to use placement new because the class contains a futex, which is
1370
- // non-movable and non-copyable
1371
- using Metadata = _t<std::decay<decltype(waiter->metadata_)>>;
1372
- static_assert(std::is_trivially_destructible<Metadata>{}, "");
1373
-
1374
- // we need release here because of the write to waker_ and also because we
1375
- // are unlocking the mutex, the thread we do the handoff to here should
1376
- // see the modified data
1377
- new (&waiter->metadata_) Metadata(waker, bit_cast<uintptr_t>(sleepers));
1378
- waiter->futex_.store(kWake, std::memory_order_release);
1379
- return 0;
1380
- }
1381
-
1382
- // if the thread is not a sleeper, and we were not able to catch it before
1383
- // preemption, we can just return a false, it is safe to read next_ because
1384
- // the thread was preempted. Preemption signals can only come after the
1385
- // thread has set the next_ pointer, since the timestamp writes only start
1386
- // occurring after that point
1387
- //
1388
- // if a thread was preempted it must have stored next_ in the waiter struct,
1389
- // as the store to futex_ that resets the value from kUninitialized happens
1390
- // after the write to next
1391
- assert(publishing);
1392
- if (!isSleeper(value)) {
1393
- // go on to the next one
1394
- //
1395
- // Also, we need a memory_order_release here to prevent missed wakeups. A
1396
- // missed wakeup here can happen when we see that a thread had been
1397
- // preempted and skip it. Then go on to release the lock, and then when
1398
- // the thread which got skipped does an exchange on the central storage,
1399
- // still sees the locked bit, and never gets woken up
1400
- //
1401
- // Can we relax this?
1402
- assert(preempted(value, now));
1403
- assert(!isCombiner(value));
1404
- next = waiter->next_.load(std::memory_order_relaxed);
1405
- waiter->futex_.store(kSkipped, std::memory_order_release);
1406
- return next;
1407
- }
1408
-
1409
- // if we are here the thread is a sleeper
1410
- //
1411
- // we attempt to catch the thread before it goes to futex(). If we are able
1412
- // to catch the thread before it sleeps on a futex, we are done, and don't
1413
- // need to go any further
1414
- //
1415
- // if we are not able to catch the thread before it goes to futex, we
1416
- // collect the current thread in the list of sleeping threads represented by
1417
- // sleepers, and return the next thread in the list and return false along
1418
- // with the previous next value
1419
- //
1420
- // it is safe to read the next_ pointer in the waiter struct if we were
1421
- // unable to catch the thread before it went to futex() because we use
1422
- // acquire-release ordering for the exchange operation below. And if we see
1423
- // that the thread was already sleeping, we have synchronized with the write
1424
- // to next_ in the context of the sleeping thread
1425
- //
1426
- // Also we need to set the value of waiters_ and waker_ in the thread before
1427
- // doing the exchange because we need to pass on the list of sleepers in the
1428
- // event that we were able to catch the thread before it went to futex().
1429
- // If we were unable to catch the thread before it slept, these fields will
1430
- // be ignored when the thread wakes up anyway
1431
- assert(isSleeper(value));
1432
- waiter->metadata_.waker_ = waker;
1433
- waiter->metadata_.waiters_ = folly::bit_cast<std::uintptr_t>(sleepers);
1434
- auto pre =
1435
- waiter->metadata_.sleeper_.exchange(kSleeping, std::memory_order_acq_rel);
1436
-
1437
- // we were able to catch the thread before it went to sleep, return true
1438
- if (pre != kSleeping) {
1439
- return 0;
1440
- }
1441
-
1442
- // otherwise return false, with the value of next_, it is safe to read next
1443
- // because of the same logic as when a thread was preempted
1444
- //
1445
- // we also need to collect this sleeper in the list of sleepers being built
1446
- // up
1447
- next = waiter->next_.load(std::memory_order_relaxed);
1448
- auto head = folly::bit_cast<std::uintptr_t>(sleepers);
1449
- waiter->next_.store(head, std::memory_order_relaxed);
1450
- sleepers = waiter;
1451
- return next;
1452
- }
1453
-
1454
- template <typename Waiter>
1455
- bool wake(
1456
- bool publishing,
1457
- Waiter& waiter,
1458
- std::uintptr_t waker,
1459
- Waiter*& sleepers,
1460
- std::uint64_t iter) {
1461
- // loop till we find a node that is either at the end of the list (as
1462
- // specified by waker) or we find a node that is active (as specified by
1463
- // the last published timestamp of the node)
1464
- auto current = &waiter;
1465
- while (current) {
1466
- // it is important that we load the value of function and next_ after the
1467
- // initial acquire load. This is required because we need to synchronize
1468
- // with the construction of the waiter struct before reading from it
1469
- //
1470
- // the load from the next_ variable is an optimistic load that assumes
1471
- // that the waiting thread has probably gone to the waiting state. If the
1472
- // waiitng thread is in the waiting state (as revealed by the acquire load
1473
- // from the futex word), we will see a well formed next_ value because it
1474
- // happens-before the release store to the futex word. The atomic load from
1475
- // next_ is an optimization to avoid branching before loading and prevent
1476
- // the compiler from eliding the load altogether (and using a pointer
1477
- // dereference when needed)
1478
- auto value = current->futex_.load(std::memory_order_acquire);
1479
- auto next = current->next_.load(std::memory_order_relaxed);
1480
- auto task = loadTask(current, value);
1481
- next =
1482
- tryWake(publishing, current, value, next, waker, sleepers, iter, task);
1483
-
1484
- // if there is no next node, we have managed to wake someone up and have
1485
- // successfully migrated the lock to another thread
1486
- if (!next) {
1487
- return true;
1488
- }
1489
-
1490
- // we need to read the value of the next node in the list before skipping
1491
- // it, this is because after we skip it the node might wake up and enqueue
1492
- // itself, and thereby gain a new next node
1493
- assert(publishing);
1494
- current = (next == waker) ? nullptr : extractPtr<Waiter>(next);
1495
- }
1496
-
1497
- return false;
1498
- }
1499
-
1500
- template <typename Atomic, typename Proxy, typename Sleepers>
1501
- bool tryUnlockClean(Atomic& state, Proxy& proxy, Sleepers sleepers) {
1502
- auto expected = proxy.expected_;
1503
- while (true) {
1504
- if (state.compare_exchange_strong(
1505
- expected,
1506
- kUnlocked,
1507
- std::memory_order_release,
1508
- std::memory_order_relaxed)) {
1509
- // if we were able to commit an unlocked, we need to wake up the futex
1510
- // waiters, if any
1511
- doFutexWake(sleepers);
1512
- return true;
1513
- }
1514
-
1515
- // if we failed the compare_exchange_strong() above, we check to see if
1516
- // the failure was because of the presence of a timed waiter. If that
1517
- // was the case then we try one more time with the kTimedWaiter bit set
1518
- if (expected == (proxy.expected_ | kTimedWaiter)) {
1519
- proxy.timedWaiters_ = true;
1520
- continue;
1521
- }
1522
-
1523
- // otherwise break, we have a contention chain
1524
- return false;
1525
- }
1526
- }
1527
-
1528
- template <template <typename> class Atomic, bool Publish>
1529
- void DistributedMutex<Atomic, Publish>::unlock(
1530
- typename DistributedMutex::DistributedMutexStateProxy proxy) {
1531
- // we always wake up ready threads and timed waiters if we saw either
1532
- assert(proxy);
1533
- assert(!proxy.combined_);
1534
- SCOPE_EXIT {
1535
- doFutexWake(proxy.ready_);
1536
- wakeTimedWaiters(&state_, proxy.timedWaiters_);
1537
- };
1538
-
1539
- // if there is a wait queue we are responsible for, try and start wakeups,
1540
- // don't bother with the mutex state
1541
- auto sleepers = proxy.waiters_;
1542
- if (proxy.next_) {
1543
- if (wake(Publish, *proxy.next_, proxy.waker_, sleepers, 0)) {
1544
- return;
1545
- }
1546
-
1547
- // At this point, if are in the if statement, we were not the terminal
1548
- // node of the wakeup chain. Terminal nodes have the next_ pointer set to
1549
- // null in lock()
1550
- //
1551
- // So we need to pretend we were the end of the contention chain. Coming
1552
- // out of a contention chain always has the kLocked state set in the
1553
- // mutex. Unless there is another contention chain lined up, which does
1554
- // not matter since we are the terminal node anyway
1555
- proxy.expected_ = kLocked;
1556
- }
1557
-
1558
- for (std::uint64_t i = 0; true; ++i) {
1559
- // otherwise, since we don't have anyone we need to wake up, we try and
1560
- // release the mutex just as is
1561
- //
1562
- // if this is successful, we can return, the unlock was successful, we have
1563
- // committed a nice kUnlocked to the central storage, yay
1564
- if (tryUnlockClean(state_, proxy, sleepers)) {
1565
- return;
1566
- }
1567
-
1568
- // here we have a contention chain built up on the mutex. We grab the
1569
- // wait queue and start executing wakeups. We leave a locked bit on the
1570
- // centralized storage and handoff control to the head of the queue
1571
- //
1572
- // we use memory_order_acq_rel here because we want to see the
1573
- // full well-initialized node that the other thread is waiting on
1574
- //
1575
- // If we are unable to wake the contention chain, it is possible that when
1576
- // we come back to looping here, a new contention chain will form. In
1577
- // that case we need to use kLocked as the waker_ value because the
1578
- // terminal node of the new chain will see kLocked in the central storage
1579
- auto head = state_.exchange(kLocked, std::memory_order_acq_rel);
1580
- recordTimedWaiterAndClearTimedBit(proxy.timedWaiters_, head);
1581
- auto next = extractPtr<Waiter<Atomic>>(head);
1582
- auto expected = folly::exchange(proxy.expected_, kLocked);
1583
- assert((head & kLocked) && (head != kLocked));
1584
- if (wake(Publish, *next, expected, sleepers, i)) {
1585
- break;
1586
- }
1587
- }
1588
- }
1589
-
1590
- template <typename Atomic, typename Deadline, typename MakeProxy>
1591
- auto timedLock(Atomic& state, Deadline deadline, MakeProxy proxy)
1592
- -> decltype(std::declval<MakeProxy&>()(nullptr, kLocked, true)) {
1593
- while (true) {
1594
- // we put a bit on the central state to show that there is a timed waiter
1595
- // and go to sleep on the central state
1596
- //
1597
- // when this thread goes to unlock the mutex, it will expect a 0b1 in the
1598
- // mutex state (0b1, not 0b11), but then it will see that the value in the
1599
- // mutex state is 0b11 and not 0b1, meaning that there might have been
1600
- // another timed waiter. Even though there might not have been another
1601
- // timed waiter in the time being. This sort of missed wakeup is
1602
- // desirable for timed waiters; it helps avoid thundering herds of timed
1603
- // waiters. Because the mutex is packed in 8 bytes, and we need an
1604
- // address to be stored in those 8 bytes, we don't have much room to play
1605
- // with. The only other solution is to issue a futexWake(INT_MAX) to wake
1606
- // up all waiters when a clean unlock is committed, when a thread saw a
1607
- // timed waiter in the mutex previously.
1608
- //
1609
- // putting a 0b11 here works for a set of reasons that is a superset of
1610
- // the set of reasons that make it okay to put a kLocked (0b1) in the
1611
- // mutex state. Now that the thread has put (kTimedWaiter | kLocked)
1612
- // (0b11) in the mutex state and it expects a kLocked (0b1), there are two
1613
- // scenarios possible. The first being when there is no contention chain
1614
- // formation in the mutex from the time a timed waiter got a lock to
1615
- // unlock. In this case, the unlocker sees a 0b11 in the mutex state,
1616
- // adjusts to the presence of a timed waiter and cleanly unlocks with a
1617
- // kUnlocked (0b0). The second is when there is a contention chain.
1618
- // When a thread puts its address in the mutex and sees the timed bit, it
1619
- // records the presence of a timed waiter, and then pretends as if it
1620
- // hadn't seen the timed bit. So future contention chain releases, will
1621
- // terminate with a kLocked (0b1) and not a (kLocked | kTimedWaiter)
1622
- // (0b11). This just works naturally with the rest of the algorithm
1623
- // without incurring a perf hit for the regular non-timed case
1624
- //
1625
- // this strategy does however mean, that when threads try to acquire the
1626
- // mutex and all time out, there will be a wasteful syscall to issue wakeups
1627
- // to waiting threads. We don't do anything to try and minimize this
1628
- //
1629
- // we need to use a fetch_or() here because we need to convey two bits of
1630
- // information - 1, whether the mutex is locked or not, and 2, whether
1631
- // there is a timed waiter. The alternative here is to use the second bit
1632
- // to convey information only, we can use a fetch_set() on the second bit
1633
- // to make this faster, but that comes at the expense of requiring regular
1634
- // fast path lock attempts. Which use a single bit read-modify-write for
1635
- // better performance
1636
- auto data = kTimedWaiter | kLocked;
1637
- auto previous = state.fetch_or(data, std::memory_order_acquire);
1638
- if (!(previous & 0b1)) {
1639
- assert(!previous);
1640
- return proxy(nullptr, kLocked, true);
1641
- }
1642
-
1643
- // wait on the futex until signalled, if we get a timeout, the try_lock
1644
- // fails
1645
- auto result = atomic_wait_until(&state, previous | data, deadline);
1646
- if (result == std::cv_status::timeout) {
1647
- return proxy(nullptr, std::uintptr_t{0}, false);
1648
- }
1649
- }
1650
- }
1651
-
1652
- template <template <typename> class Atomic, bool TimePublishing>
1653
- template <typename Clock, typename Duration>
1654
- typename DistributedMutex<Atomic, TimePublishing>::DistributedMutexStateProxy
1655
- DistributedMutex<Atomic, TimePublishing>::try_lock_until(
1656
- const std::chrono::time_point<Clock, Duration>& deadline) {
1657
- // fast path for the uncontended case
1658
- //
1659
- // we get the time after trying to acquire the mutex because in the
1660
- // uncontended case, the price of getting the time is about 1/3 of the
1661
- // actual mutex acquisition. So we only pay the price of that extra bit of
1662
- // latency when needed
1663
- //
1664
- // this is even higher when VDSO is involved on architectures that do not
1665
- // offer a direct interface to the timestamp counter
1666
- if (auto state = try_lock()) {
1667
- return state;
1668
- }
1669
-
1670
- // fall back to the timed locking algorithm
1671
- using Proxy = DistributedMutexStateProxy;
1672
- return timedLock(
1673
- state_,
1674
- deadline,
1675
- [](Waiter<Atomic>* next, std::uintptr_t expected, bool timedWaiter) {
1676
- return Proxy{next, expected, timedWaiter};
1677
- });
1678
- }
1679
-
1680
- template <template <typename> class Atomic, bool TimePublishing>
1681
- template <typename Rep, typename Period>
1682
- typename DistributedMutex<Atomic, TimePublishing>::DistributedMutexStateProxy
1683
- DistributedMutex<Atomic, TimePublishing>::try_lock_for(
1684
- const std::chrono::duration<Rep, Period>& duration) {
1685
- // fast path for the uncontended case. Reasoning for doing this here is the
1686
- // same as in try_lock_until()
1687
- if (auto state = try_lock()) {
1688
- return state;
1689
- }
1690
-
1691
- // fall back to the timed locking algorithm
1692
- using Proxy = DistributedMutexStateProxy;
1693
- auto deadline = std::chrono::steady_clock::now() + duration;
1694
- return timedLock(
1695
- state_,
1696
- deadline,
1697
- [](Waiter<Atomic>* next, std::uintptr_t expected, bool timedWaiter) {
1698
- return Proxy{next, expected, timedWaiter};
1699
- });
1700
- }
1701
- } // namespace distributed_mutex
1702
- } // namespace detail
1703
- } // namespace folly