@nxtedition/rocksdb 8.2.8 → 9.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (483)
  1. package/binding.cc +0 -21
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +20 -10
  3. package/deps/rocksdb/rocksdb/Makefile +37 -25
  4. package/deps/rocksdb/rocksdb/README.md +29 -0
  5. package/deps/rocksdb/rocksdb/TARGETS +25 -2
  6. package/deps/rocksdb/rocksdb/cache/cache.cc +35 -0
  7. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +229 -74
  8. package/deps/rocksdb/rocksdb/cache/cache_helpers.cc +2 -1
  9. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +4 -3
  10. package/deps/rocksdb/rocksdb/cache/cache_test.cc +58 -95
  11. package/deps/rocksdb/rocksdb/cache/charged_cache.cc +4 -2
  12. package/deps/rocksdb/rocksdb/cache/charged_cache.h +5 -3
  13. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +2683 -496
  14. package/deps/rocksdb/rocksdb/cache/clock_cache.h +580 -159
  15. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +145 -42
  16. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +20 -1
  17. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +391 -17
  18. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +7 -5
  19. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +309 -212
  20. package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +0 -32
  21. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +439 -12
  22. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.h +44 -2
  23. package/deps/rocksdb/rocksdb/cache/sharded_cache.cc +11 -1
  24. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +16 -3
  25. package/deps/rocksdb/rocksdb/cache/tiered_secondary_cache.cc +119 -0
  26. package/deps/rocksdb/rocksdb/cache/tiered_secondary_cache.h +155 -0
  27. package/deps/rocksdb/rocksdb/cache/tiered_secondary_cache_test.cc +711 -0
  28. package/deps/rocksdb/rocksdb/cache/typed_cache.h +17 -11
  29. package/deps/rocksdb/rocksdb/crash_test.mk +14 -0
  30. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +28 -12
  31. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +1 -0
  32. package/deps/rocksdb/rocksdb/db/blob/blob_contents.h +2 -1
  33. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +1 -1
  34. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +1 -1
  35. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.cc +2 -2
  36. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.h +1 -1
  37. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +20 -22
  38. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +1 -2
  39. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +1 -1
  40. package/deps/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc +2 -3
  41. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +1 -1
  42. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +8 -0
  43. package/deps/rocksdb/rocksdb/db/blob/db_blob_index_test.cc +7 -3
  44. package/deps/rocksdb/rocksdb/db/builder.cc +35 -10
  45. package/deps/rocksdb/rocksdb/db/c.cc +233 -6
  46. package/deps/rocksdb/rocksdb/db/c_test.c +140 -6
  47. package/deps/rocksdb/rocksdb/db/column_family.cc +110 -51
  48. package/deps/rocksdb/rocksdb/db/column_family.h +34 -2
  49. package/deps/rocksdb/rocksdb/db/column_family_test.cc +314 -7
  50. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +4 -1
  51. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +106 -23
  52. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +47 -9
  53. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +10 -11
  54. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -6
  55. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +2 -2
  56. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +148 -60
  57. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +22 -7
  58. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +2 -0
  59. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +8 -4
  60. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +33 -23
  61. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +14 -5
  62. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +11 -11
  63. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +3 -0
  64. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +90 -4
  65. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +170 -95
  66. package/deps/rocksdb/rocksdb/db/compaction/file_pri.h +3 -1
  67. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +32 -58
  68. package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +3 -1
  69. package/deps/rocksdb/rocksdb/db/convenience.cc +20 -3
  70. package/deps/rocksdb/rocksdb/db/convenience_impl.h +15 -0
  71. package/deps/rocksdb/rocksdb/db/corruption_test.cc +17 -0
  72. package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +1 -0
  73. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +46 -10
  74. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +13 -3
  75. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +74 -15
  76. package/deps/rocksdb/rocksdb/db/db_compaction_filter_test.cc +27 -3
  77. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +850 -44
  78. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +2 -29
  79. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +275 -1
  80. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +52 -19
  81. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +6 -5
  82. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +733 -320
  83. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +155 -66
  84. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +516 -155
  85. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +8 -4
  86. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +2 -1
  87. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +17 -4
  88. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +100 -35
  89. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +95 -50
  90. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +13 -9
  91. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +136 -79
  92. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +6 -95
  93. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +31 -22
  94. package/deps/rocksdb/rocksdb/db/db_info_dumper.cc +6 -0
  95. package/deps/rocksdb/rocksdb/db/db_iter.cc +85 -57
  96. package/deps/rocksdb/rocksdb/db/db_iter.h +11 -2
  97. package/deps/rocksdb/rocksdb/db/db_iter_test.cc +29 -0
  98. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +276 -21
  99. package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +35 -0
  100. package/deps/rocksdb/rocksdb/db/db_merge_operand_test.cc +4 -11
  101. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +193 -7
  102. package/deps/rocksdb/rocksdb/db/db_options_test.cc +294 -26
  103. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +26 -36
  104. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +364 -0
  105. package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +13 -3
  106. package/deps/rocksdb/rocksdb/db/db_readonly_with_timestamp_test.cc +52 -0
  107. package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +74 -1
  108. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +22 -4
  109. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +1 -1
  110. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +1 -0
  111. package/deps/rocksdb/rocksdb/db/db_tailing_iter_test.cc +282 -167
  112. package/deps/rocksdb/rocksdb/db/db_test.cc +180 -49
  113. package/deps/rocksdb/rocksdb/db/db_test2.cc +84 -12
  114. package/deps/rocksdb/rocksdb/db/db_test_util.cc +25 -12
  115. package/deps/rocksdb/rocksdb/db/db_test_util.h +45 -2
  116. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +14 -1
  117. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +245 -0
  118. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +480 -1
  119. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +6 -6
  120. package/deps/rocksdb/rocksdb/db/db_write_test.cc +2 -2
  121. package/deps/rocksdb/rocksdb/db/dbformat.cc +36 -0
  122. package/deps/rocksdb/rocksdb/db/dbformat.h +169 -20
  123. package/deps/rocksdb/rocksdb/db/dbformat_test.cc +129 -0
  124. package/deps/rocksdb/rocksdb/db/deletefile_test.cc +2 -0
  125. package/deps/rocksdb/rocksdb/db/error_handler.cc +67 -34
  126. package/deps/rocksdb/rocksdb/db/error_handler.h +13 -9
  127. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +4 -4
  128. package/deps/rocksdb/rocksdb/db/event_helpers.cc +4 -0
  129. package/deps/rocksdb/rocksdb/db/experimental.cc +2 -1
  130. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +4 -4
  131. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +17 -8
  132. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +144 -4
  133. package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +1 -1
  134. package/deps/rocksdb/rocksdb/db/file_indexer.cc +2 -4
  135. package/deps/rocksdb/rocksdb/db/flush_job.cc +105 -17
  136. package/deps/rocksdb/rocksdb/db/flush_job.h +27 -4
  137. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +90 -12
  138. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +2 -3
  139. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +159 -91
  140. package/deps/rocksdb/rocksdb/db/import_column_family_job.h +19 -10
  141. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +143 -0
  142. package/deps/rocksdb/rocksdb/db/internal_stats.cc +13 -1
  143. package/deps/rocksdb/rocksdb/db/internal_stats.h +2 -0
  144. package/deps/rocksdb/rocksdb/db/listener_test.cc +2 -1
  145. package/deps/rocksdb/rocksdb/db/log_reader.h +3 -2
  146. package/deps/rocksdb/rocksdb/db/log_test.cc +17 -21
  147. package/deps/rocksdb/rocksdb/db/log_writer.cc +1 -1
  148. package/deps/rocksdb/rocksdb/db/log_writer.h +3 -2
  149. package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +6 -3
  150. package/deps/rocksdb/rocksdb/db/memtable.cc +70 -83
  151. package/deps/rocksdb/rocksdb/db/memtable.h +45 -1
  152. package/deps/rocksdb/rocksdb/db/memtable_list.cc +45 -11
  153. package/deps/rocksdb/rocksdb/db/memtable_list.h +43 -2
  154. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +91 -5
  155. package/deps/rocksdb/rocksdb/db/merge_helper.cc +330 -115
  156. package/deps/rocksdb/rocksdb/db/merge_helper.h +100 -12
  157. package/deps/rocksdb/rocksdb/db/merge_operator.cc +82 -0
  158. package/deps/rocksdb/rocksdb/db/merge_test.cc +267 -0
  159. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +5 -2
  160. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.h +4 -4
  161. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +3 -0
  162. package/deps/rocksdb/rocksdb/db/prefix_test.cc +1 -0
  163. package/deps/rocksdb/rocksdb/db/range_del_aggregator.h +4 -0
  164. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +4 -0
  165. package/deps/rocksdb/rocksdb/db/repair.cc +25 -7
  166. package/deps/rocksdb/rocksdb/db/repair_test.cc +143 -2
  167. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +459 -74
  168. package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.cc +105 -69
  169. package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.h +83 -46
  170. package/deps/rocksdb/rocksdb/db/table_cache.cc +76 -54
  171. package/deps/rocksdb/rocksdb/db/table_cache.h +18 -12
  172. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +2 -2
  173. package/deps/rocksdb/rocksdb/db/version_builder.cc +0 -1
  174. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +236 -204
  175. package/deps/rocksdb/rocksdb/db/version_edit.cc +66 -4
  176. package/deps/rocksdb/rocksdb/db/version_edit.h +58 -10
  177. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +80 -8
  178. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +12 -0
  179. package/deps/rocksdb/rocksdb/db/version_edit_test.cc +86 -17
  180. package/deps/rocksdb/rocksdb/db/version_set.cc +207 -110
  181. package/deps/rocksdb/rocksdb/db/version_set.h +36 -15
  182. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +2 -5
  183. package/deps/rocksdb/rocksdb/db/version_set_test.cc +47 -26
  184. package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +525 -0
  185. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +6 -22
  186. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +0 -20
  187. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +0 -29
  188. package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.cc +46 -0
  189. package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.h +40 -0
  190. package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper_test.cc +39 -0
  191. package/deps/rocksdb/rocksdb/db/write_batch.cc +55 -20
  192. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +3 -0
  193. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +16 -0
  194. package/deps/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt +1 -0
  195. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +4 -4
  196. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +4 -7
  197. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +88 -10
  198. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +37 -13
  199. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +110 -58
  200. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +42 -0
  201. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +68 -17
  202. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +34 -0
  203. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +8 -1
  204. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +429 -237
  205. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +13 -6
  206. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +21 -14
  207. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_wide_merge_operator.cc +51 -0
  208. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_wide_merge_operator.h +27 -0
  209. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +3 -6
  210. package/deps/rocksdb/rocksdb/db_stress_tool/expected_value.h +2 -0
  211. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +29 -38
  212. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +302 -101
  213. package/deps/rocksdb/rocksdb/env/env.cc +6 -2
  214. package/deps/rocksdb/rocksdb/env/env_encryption.cc +11 -165
  215. package/deps/rocksdb/rocksdb/env/env_encryption_ctr.h +0 -17
  216. package/deps/rocksdb/rocksdb/env/env_posix.cc +6 -2
  217. package/deps/rocksdb/rocksdb/env/env_test.cc +86 -2
  218. package/deps/rocksdb/rocksdb/env/fs_posix.cc +6 -4
  219. package/deps/rocksdb/rocksdb/env/unique_id_gen.cc +79 -0
  220. package/deps/rocksdb/rocksdb/env/unique_id_gen.h +34 -0
  221. package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +1 -0
  222. package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +15 -4
  223. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +100 -70
  224. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +64 -18
  225. package/deps/rocksdb/rocksdb/file/file_util.cc +10 -5
  226. package/deps/rocksdb/rocksdb/file/file_util.h +13 -1
  227. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +1225 -97
  228. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +72 -33
  229. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +3 -16
  230. package/deps/rocksdb/rocksdb/file/random_access_file_reader_test.cc +23 -12
  231. package/deps/rocksdb/rocksdb/file/sequence_file_reader.h +3 -0
  232. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +40 -14
  233. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +163 -91
  234. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +112 -2
  235. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +108 -16
  236. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_filter.h +11 -0
  237. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +3 -0
  238. package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +42 -2
  239. package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +1 -1
  240. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +92 -12
  241. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +34 -4
  242. package/deps/rocksdb/rocksdb/include/rocksdb/env_encryption.h +9 -109
  243. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +91 -13
  244. package/deps/rocksdb/rocksdb/include/rocksdb/filter_policy.h +8 -3
  245. package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +10 -4
  246. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +7 -0
  247. package/deps/rocksdb/rocksdb/include/rocksdb/memory_allocator.h +1 -1
  248. package/deps/rocksdb/rocksdb/include/rocksdb/merge_operator.h +55 -4
  249. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +130 -22
  250. package/deps/rocksdb/rocksdb/include/rocksdb/port_defs.h +4 -0
  251. package/deps/rocksdb/rocksdb/include/rocksdb/rate_limiter.h +9 -0
  252. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +92 -9
  253. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_manager.h +2 -1
  254. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +5 -1
  255. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +37 -2
  256. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +35 -0
  257. package/deps/rocksdb/rocksdb/include/rocksdb/system_clock.h +15 -0
  258. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +7 -1
  259. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +20 -3
  260. package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +7 -0
  261. package/deps/rocksdb/rocksdb/include/rocksdb/types.h +7 -0
  262. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +6 -1
  263. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h +33 -2
  264. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h +2 -1
  265. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +14 -0
  266. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +42 -2
  267. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +0 -3
  268. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  269. package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +53 -2
  270. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +3 -2
  271. package/deps/rocksdb/rocksdb/memory/arena_test.cc +18 -11
  272. package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc +4 -3
  273. package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.h +1 -1
  274. package/deps/rocksdb/rocksdb/microbench/README.md +60 -0
  275. package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +69 -34
  276. package/deps/rocksdb/rocksdb/monitoring/instrumented_mutex.h +1 -1
  277. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +22 -1
  278. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +18 -7
  279. package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +14 -0
  280. package/deps/rocksdb/rocksdb/options/cf_options.cc +19 -0
  281. package/deps/rocksdb/rocksdb/options/cf_options.h +10 -2
  282. package/deps/rocksdb/rocksdb/options/customizable_test.cc +6 -1
  283. package/deps/rocksdb/rocksdb/options/db_options.cc +54 -2
  284. package/deps/rocksdb/rocksdb/options/db_options.h +4 -0
  285. package/deps/rocksdb/rocksdb/options/options.cc +15 -1
  286. package/deps/rocksdb/rocksdb/options/options_helper.cc +18 -0
  287. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +14 -4
  288. package/deps/rocksdb/rocksdb/options/options_test.cc +14 -1
  289. package/deps/rocksdb/rocksdb/plugin/README.md +43 -0
  290. package/deps/rocksdb/rocksdb/port/README +10 -0
  291. package/deps/rocksdb/rocksdb/port/mmap.h +20 -0
  292. package/deps/rocksdb/rocksdb/port/port_example.h +1 -1
  293. package/deps/rocksdb/rocksdb/port/port_posix.cc +1 -1
  294. package/deps/rocksdb/rocksdb/port/port_posix.h +7 -4
  295. package/deps/rocksdb/rocksdb/port/stack_trace.cc +32 -12
  296. package/deps/rocksdb/rocksdb/port/win/env_win.h +1 -1
  297. package/deps/rocksdb/rocksdb/port/win/port_win.h +5 -2
  298. package/deps/rocksdb/rocksdb/src.mk +10 -1
  299. package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +2 -1
  300. package/deps/rocksdb/rocksdb/table/block_based/block.cc +48 -22
  301. package/deps/rocksdb/rocksdb/table/block_based/block.h +60 -12
  302. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +116 -43
  303. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +9 -6
  304. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +321 -49
  305. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +98 -4
  306. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +233 -98
  307. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +58 -23
  308. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +12 -8
  309. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +52 -24
  310. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +219 -51
  311. package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +41 -8
  312. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +25 -1
  313. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +3 -1
  314. package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +26 -7
  315. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +50 -18
  316. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.h +20 -8
  317. package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +232 -71
  318. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +6 -6
  319. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +44 -26
  320. package/deps/rocksdb/rocksdb/table/block_based/filter_policy_internal.h +2 -1
  321. package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +1 -1
  322. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +31 -16
  323. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +97 -58
  324. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +2 -2
  325. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.h +6 -0
  326. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +36 -19
  327. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +3 -1
  328. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +114 -70
  329. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +4 -3
  330. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +11 -7
  331. package/deps/rocksdb/rocksdb/table/block_based/reader_common.cc +15 -3
  332. package/deps/rocksdb/rocksdb/table/block_based/reader_common.h +6 -3
  333. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +1 -1
  334. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +14 -13
  335. package/deps/rocksdb/rocksdb/table/block_fetcher.h +4 -0
  336. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +9 -2
  337. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +1 -0
  338. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.cc +6 -2
  339. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc +1 -2
  340. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc +2 -3
  341. package/deps/rocksdb/rocksdb/table/format.cc +175 -33
  342. package/deps/rocksdb/rocksdb/table/format.h +63 -10
  343. package/deps/rocksdb/rocksdb/table/get_context.cc +52 -89
  344. package/deps/rocksdb/rocksdb/table/get_context.h +12 -3
  345. package/deps/rocksdb/rocksdb/table/internal_iterator.h +11 -0
  346. package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +29 -1
  347. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +22 -2
  348. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +12 -4
  349. package/deps/rocksdb/rocksdb/table/meta_blocks.h +1 -0
  350. package/deps/rocksdb/rocksdb/table/mock_table.cc +8 -3
  351. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +10 -5
  352. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.h +10 -1
  353. package/deps/rocksdb/rocksdb/table/plain/plain_table_key_coding.cc +1 -2
  354. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +3 -3
  355. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +45 -9
  356. package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +1 -0
  357. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +24 -1
  358. package/deps/rocksdb/rocksdb/table/table_builder.h +6 -2
  359. package/deps/rocksdb/rocksdb/table/table_properties.cc +6 -0
  360. package/deps/rocksdb/rocksdb/table/table_reader.h +6 -0
  361. package/deps/rocksdb/rocksdb/table/table_test.cc +52 -22
  362. package/deps/rocksdb/rocksdb/test_util/mock_time_env.h +31 -0
  363. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.cc +2 -1
  364. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.h +19 -7
  365. package/deps/rocksdb/rocksdb/test_util/sync_point.h +3 -1
  366. package/deps/rocksdb/rocksdb/test_util/testutil.cc +29 -0
  367. package/deps/rocksdb/rocksdb/test_util/testutil.h +19 -0
  368. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim.py +3 -3
  369. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +87 -65
  370. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +221 -33
  371. package/deps/rocksdb/rocksdb/tools/ldb_cmd_impl.h +36 -0
  372. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +1 -1
  373. package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +1 -0
  374. package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +33 -11
  375. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +4 -0
  376. package/deps/rocksdb/rocksdb/unreleased_history/README.txt +73 -0
  377. package/deps/rocksdb/rocksdb/unreleased_history/add.sh +27 -0
  378. package/deps/rocksdb/rocksdb/unreleased_history/behavior_changes/.gitkeep +0 -0
  379. package/deps/rocksdb/rocksdb/unreleased_history/bug_fixes/.gitkeep +0 -0
  380. package/deps/rocksdb/rocksdb/unreleased_history/new_features/.gitkeep +0 -0
  381. package/deps/rocksdb/rocksdb/unreleased_history/performance_improvements/.gitkeep +0 -0
  382. package/deps/rocksdb/rocksdb/unreleased_history/public_api_changes/.gitkeep +0 -0
  383. package/deps/rocksdb/rocksdb/unreleased_history/release.sh +104 -0
  384. package/deps/rocksdb/rocksdb/util/async_file_reader.cc +5 -0
  385. package/deps/rocksdb/rocksdb/util/bloom_impl.h +3 -3
  386. package/deps/rocksdb/rocksdb/util/bloom_test.cc +32 -11
  387. package/deps/rocksdb/rocksdb/util/cast_util.h +24 -0
  388. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +2 -0
  389. package/deps/rocksdb/rocksdb/util/comparator.cc +55 -8
  390. package/deps/rocksdb/rocksdb/util/compression.cc +4 -4
  391. package/deps/rocksdb/rocksdb/util/compression.h +119 -35
  392. package/deps/rocksdb/rocksdb/util/core_local.h +2 -1
  393. package/deps/rocksdb/rocksdb/util/crc32c.cc +7 -1
  394. package/deps/rocksdb/rocksdb/util/distributed_mutex.h +1 -1
  395. package/deps/rocksdb/rocksdb/util/dynamic_bloom.h +4 -4
  396. package/deps/rocksdb/rocksdb/util/filelock_test.cc +3 -0
  397. package/deps/rocksdb/rocksdb/util/hash.h +7 -3
  398. package/deps/rocksdb/rocksdb/util/hash_test.cc +44 -0
  399. package/deps/rocksdb/rocksdb/util/math.h +58 -6
  400. package/deps/rocksdb/rocksdb/util/math128.h +29 -7
  401. package/deps/rocksdb/rocksdb/util/mutexlock.h +35 -27
  402. package/deps/rocksdb/rocksdb/util/overload.h +23 -0
  403. package/deps/rocksdb/rocksdb/util/rate_limiter.cc +53 -18
  404. package/deps/rocksdb/rocksdb/util/rate_limiter_impl.h +6 -1
  405. package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +90 -19
  406. package/deps/rocksdb/rocksdb/util/single_thread_executor.h +1 -0
  407. package/deps/rocksdb/rocksdb/util/slice_test.cc +30 -0
  408. package/deps/rocksdb/rocksdb/util/status.cc +1 -0
  409. package/deps/rocksdb/rocksdb/util/stop_watch.h +1 -1
  410. package/deps/rocksdb/rocksdb/util/string_util.cc +39 -0
  411. package/deps/rocksdb/rocksdb/util/string_util.h +10 -0
  412. package/deps/rocksdb/rocksdb/util/thread_operation.h +10 -1
  413. package/deps/rocksdb/rocksdb/util/udt_util.cc +385 -0
  414. package/deps/rocksdb/rocksdb/util/udt_util.h +192 -1
  415. package/deps/rocksdb/rocksdb/util/udt_util_test.cc +461 -0
  416. package/deps/rocksdb/rocksdb/util/write_batch_util.cc +25 -0
  417. package/deps/rocksdb/rocksdb/util/write_batch_util.h +80 -0
  418. package/deps/rocksdb/rocksdb/util/xxhash.h +0 -3
  419. package/deps/rocksdb/rocksdb/util/xxph3.h +0 -4
  420. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +4 -4
  421. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +71 -26
  422. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +7 -6
  423. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +1 -1
  424. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +2 -3
  425. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.cc +6 -11
  426. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +1 -2
  427. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +4 -5
  428. package/deps/rocksdb/rocksdb/utilities/fault_injection_env.h +1 -0
  429. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +20 -16
  430. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +11 -7
  431. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +2 -2
  432. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +7 -1
  433. package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc +3 -0
  434. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration_test.cc +12 -3
  435. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc +1 -2
  436. package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc +7 -4
  437. package/deps/rocksdb/rocksdb/utilities/trace/file_trace_reader_writer.cc +2 -3
  438. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc +2 -2
  439. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h +1 -1
  440. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/README +13 -0
  441. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction.cc +23 -8
  442. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +9 -6
  443. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h +37 -12
  444. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +272 -33
  445. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +15 -9
  446. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +4 -1
  447. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +76 -20
  448. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +18 -9
  449. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +195 -23
  450. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +19 -12
  451. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +88 -1
  452. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +1 -1
  453. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +43 -17
  454. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +6 -3
  455. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +73 -24
  456. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.h +19 -4
  457. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +60 -107
  458. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +41 -12
  459. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.h +6 -3
  460. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.cc +15 -8
  461. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.h +1 -1
  462. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +10 -5
  463. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +1 -1
  464. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +1 -1
  465. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +59 -28
  466. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +127 -120
  467. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +129 -59
  468. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +111 -14
  469. package/deps/rocksdb/rocksdb.gyp +6 -2
  470. package/index.js +0 -8
  471. package/package.json +1 -1
  472. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  473. package/prebuilds/linux-x64/node.napi.node +0 -0
  474. package/deps/rocksdb/rocksdb/cmake/modules/CxxFlags.cmake +0 -7
  475. package/deps/rocksdb/rocksdb/cmake/modules/FindJeMalloc.cmake +0 -29
  476. package/deps/rocksdb/rocksdb/cmake/modules/FindNUMA.cmake +0 -29
  477. package/deps/rocksdb/rocksdb/cmake/modules/FindSnappy.cmake +0 -29
  478. package/deps/rocksdb/rocksdb/cmake/modules/FindTBB.cmake +0 -33
  479. package/deps/rocksdb/rocksdb/cmake/modules/Findgflags.cmake +0 -29
  480. package/deps/rocksdb/rocksdb/cmake/modules/Findlz4.cmake +0 -29
  481. package/deps/rocksdb/rocksdb/cmake/modules/Finduring.cmake +0 -26
  482. package/deps/rocksdb/rocksdb/cmake/modules/Findzstd.cmake +0 -29
  483. package/deps/rocksdb/rocksdb/cmake/modules/ReadVersion.cmake +0 -10
@@ -20,16 +20,19 @@
20
20
  #include "cache/sharded_cache.h"
21
21
  #include "port/lang.h"
22
22
  #include "port/malloc.h"
23
+ #include "port/mmap.h"
23
24
  #include "port/port.h"
24
25
  #include "rocksdb/cache.h"
25
26
  #include "rocksdb/secondary_cache.h"
26
27
  #include "util/autovector.h"
28
+ #include "util/math.h"
27
29
 
28
30
  namespace ROCKSDB_NAMESPACE {
29
31
 
30
32
  namespace clock_cache {
31
33
 
32
34
  // Forward declaration of friend class.
35
+ template <class ClockCache>
33
36
  class ClockCacheTest;
34
37
 
35
38
  // HyperClockCache is an alternative to LRUCache specifically tailored for
@@ -37,24 +40,31 @@ class ClockCacheTest;
37
40
  //
38
41
  // Benefits
39
42
  // --------
40
- // * Fully lock free (no waits or spins) for efficiency under high concurrency
43
+ // * Lock/wait free (no waits or spins) for efficiency under high concurrency
44
+ // * Fixed version (estimated_entry_charge > 0) is fully lock/wait free
45
+ // * Automatic version (estimated_entry_charge = 0) has rare waits among
46
+ // certain insertion or erase operations that involve the same very small
47
+ // set of entries.
41
48
  // * Optimized for hot path reads. For concurrency control, most Lookup() and
42
49
  // essentially all Release() are a single atomic add operation.
43
- // * Eviction on insertion is fully parallel and lock-free.
50
+ // * Eviction on insertion is fully parallel.
44
51
  // * Uses a generalized + aging variant of CLOCK eviction that might outperform
45
52
  // LRU in some cases. (For background, see
46
53
  // https://en.wikipedia.org/wiki/Page_replacement_algorithm)
47
54
  //
48
55
  // Costs
49
56
  // -----
50
- // * Hash table is not resizable (for lock-free efficiency) so capacity is not
51
- // dynamically changeable. Rely on an estimated average value (block) size for
57
+ // * FixedHyperClockCache (estimated_entry_charge > 0) - Hash table is not
58
+ // resizable (for lock-free efficiency) so capacity is not dynamically
59
+ // changeable. Rely on an estimated average value (block) size for
52
60
  // space+time efficiency. (See estimated_entry_charge option details.)
61
+ // EXPERIMENTAL - This limitation is fixed in AutoHyperClockCache, activated
62
+ // with estimated_entry_charge == 0.
53
63
  // * Insert usually does not (but might) overwrite a previous entry associated
54
- // with a cache key. This is OK for RocksDB uses of Cache.
64
+ // with a cache key. This is OK for RocksDB uses of Cache, though it does mess
65
+ // up our REDUNDANT block cache insertion statistics.
55
66
  // * Only supports keys of exactly 16 bytes, which is what RocksDB uses for
56
- // block cache (not row cache or table cache).
57
- // * SecondaryCache is not supported.
67
+ // block cache (but not row cache or table cache).
58
68
  // * Cache priorities are less aggressively enforced. Unlike LRUCache, enough
59
69
  // transient LOW or BOTTOM priority items can evict HIGH priority entries that
60
70
  // are not referenced recently (or often) enough.
@@ -137,7 +147,8 @@ class ClockCacheTest;
137
147
  // * Empty - slot is not in use and unowned. All other metadata and data is
138
148
  // in an undefined state.
139
149
  // * Construction - slot is exclusively owned by one thread, the thread
140
- // successfully entering this state, for populating or freeing data.
150
+ // successfully entering this state, for populating or freeing data
151
+ // (de-construction, same state marker).
141
152
  // * Shareable (group) - slot holds an entry with counted references for
142
153
  // pinning and reading, including
143
154
  // * Visible - slot holds an entry that can be returned by Lookup
@@ -185,15 +196,19 @@ class ClockCacheTest;
185
196
  // know from our "redundant" stats that overwrites are very rare for the block
186
197
  // cache, so we should not spend much to make them effective.
187
198
  //
188
- // So instead we Insert as soon as we find an empty slot in the probing
189
- // sequence without seeing an existing (visible) entry for the same key. This
190
- // way we only insert if we can improve the probing performance, and we don't
191
- // need to probe beyond our insert position, assuming we are willing to let
192
- // the previous entry for the same key die of old age (eventual eviction from
193
- // not being used). We can reach a similar state with concurrent insertions,
194
- // where one will pass over the other while it is "under construction."
195
- // This temporary duplication is acceptable for RocksDB block cache because
196
- // we know redundant insertion is rare.
199
+ // FixedHyperClockCache: Instead we Insert as soon as we find an empty slot in
200
+ // the probing sequence without seeing an existing (visible) entry for the same
201
+ // key. This way we only insert if we can improve the probing performance, and
202
+ // we don't need to probe beyond our insert position, assuming we are willing
203
+ // to let the previous entry for the same key die of old age (eventual eviction
204
+ // from not being used). We can reach a similar state with concurrent
205
+ // insertions, where one will pass over the other while it is "under
206
+ // construction." This temporary duplication is acceptable for RocksDB block
207
+ // cache because we know redundant insertion is rare.
208
+ // AutoHyperClockCache: Similar, except we only notice and return an existing
209
+ // match if it is found in the search for a suitable empty slot (starting with
210
+ // the same slot as the head pointer), not by following the existing chain of
211
+ // entries. Insertions are always made to the head of the chain.
197
212
  //
198
213
  // Another problem to solve is what to return to the caller when we find an
199
214
  // existing entry whose probing position we cannot improve on, or when the
@@ -281,29 +296,6 @@ class ClockCacheTest;
281
296
 
282
297
  // ----------------------------------------------------------------------- //
283
298
 
284
- // The load factor p is a real number in (0, 1) such that at all
285
- // times at most a fraction p of all slots, without counting tombstones,
286
- // are occupied by elements. This means that the probability that a random
287
- // probe hits an occupied slot is at most p, and thus at most 1/p probes
288
- // are required on average. For example, p = 70% implies that between 1 and 2
289
- // probes are needed on average (bear in mind that this reasoning doesn't
290
- // consider the effects of clustering over time, which should be negligible
291
- // with double hashing).
292
- // Because the size of the hash table is always rounded up to the next
293
- // power of 2, p is really an upper bound on the actual load factor---the
294
- // actual load factor is anywhere between p/2 and p. This is a bit wasteful,
295
- // but bear in mind that slots only hold metadata, not actual values.
296
- // Since space cost is dominated by the values (the LSM blocks),
297
- // overprovisioning the table with metadata only increases the total cache space
298
- // usage by a tiny fraction.
299
- constexpr double kLoadFactor = 0.7;
300
-
301
- // The user can exceed kLoadFactor if the sizes of the inserted values don't
302
- // match estimated_value_size, or in some rare cases with
303
- // strict_capacity_limit == false. To avoid degenerate performance, we set a
304
- // strict upper bound on the load factor.
305
- constexpr double kStrictLoadFactor = 0.84;
306
-
307
299
  struct ClockHandleBasicData {
308
300
  Cache::ObjectPtr value = nullptr;
309
301
  const Cache::CacheItemHelper* helper = nullptr;
@@ -326,7 +318,7 @@ struct ClockHandle : public ClockHandleBasicData {
326
318
  // state of the handle. The meta word looks like this:
327
319
  // low bits high bits
328
320
  // -----------------------------------------------------------------------
329
- // | acquire counter | release counter | state marker |
321
+ // | acquire counter | release counter | hit bit | state marker |
330
322
  // -----------------------------------------------------------------------
331
323
 
332
324
  // For reading or updating counters in meta word.
@@ -340,8 +332,12 @@ struct ClockHandle : public ClockHandleBasicData {
340
332
  static constexpr uint64_t kReleaseIncrement = uint64_t{1}
341
333
  << kReleaseCounterShift;
342
334
 
335
+ // For setting the hit bit
336
+ static constexpr uint8_t kHitBitShift = 2U * kCounterNumBits;
337
+ static constexpr uint64_t kHitBitMask = uint64_t{1} << kHitBitShift;
338
+
343
339
  // For reading or updating the state marker in meta word
344
- static constexpr uint8_t kStateShift = 2U * kCounterNumBits;
340
+ static constexpr uint8_t kStateShift = kHitBitShift + 1;
345
341
 
346
342
  // Bits contribution to state marker.
347
343
  // Occupied means any state other than empty
@@ -371,14 +367,133 @@ struct ClockHandle : public ClockHandleBasicData {
371
367
  static constexpr uint8_t kMaxCountdown = kHighCountdown;
372
368
  // TODO: make these coundown values tuning parameters for eviction?
373
369
 
374
- // See above
375
- std::atomic<uint64_t> meta{};
376
-
377
- // Anticipating use for SecondaryCache support
378
- void* reserved_for_future_use = nullptr;
370
+ // See above. Mutable for read reference counting.
371
+ mutable std::atomic<uint64_t> meta{};
379
372
  }; // struct ClockHandle
380
373
 
381
- class HyperClockTable {
374
+ class BaseClockTable {
375
+ public:
376
+ BaseClockTable(CacheMetadataChargePolicy metadata_charge_policy,
377
+ MemoryAllocator* allocator,
378
+ const Cache::EvictionCallback* eviction_callback,
379
+ const uint32_t* hash_seed)
380
+ : metadata_charge_policy_(metadata_charge_policy),
381
+ allocator_(allocator),
382
+ eviction_callback_(*eviction_callback),
383
+ hash_seed_(*hash_seed) {}
384
+
385
+ template <class Table>
386
+ typename Table::HandleImpl* CreateStandalone(ClockHandleBasicData& proto,
387
+ size_t capacity,
388
+ bool strict_capacity_limit,
389
+ bool allow_uncharged);
390
+
391
+ template <class Table>
392
+ Status Insert(const ClockHandleBasicData& proto,
393
+ typename Table::HandleImpl** handle, Cache::Priority priority,
394
+ size_t capacity, bool strict_capacity_limit);
395
+
396
+ void Ref(ClockHandle& handle);
397
+
398
+ size_t GetOccupancy() const {
399
+ return occupancy_.load(std::memory_order_relaxed);
400
+ }
401
+
402
+ size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); }
403
+
404
+ size_t GetStandaloneUsage() const {
405
+ return standalone_usage_.load(std::memory_order_relaxed);
406
+ }
407
+
408
+ uint32_t GetHashSeed() const { return hash_seed_; }
409
+
410
+ uint64_t GetYieldCount() const { return yield_count_.load(); }
411
+
412
+ struct EvictionData {
413
+ size_t freed_charge = 0;
414
+ size_t freed_count = 0;
415
+ };
416
+
417
+ void TrackAndReleaseEvictedEntry(ClockHandle* h, EvictionData* data);
418
+
419
+ #ifndef NDEBUG
420
+ // Acquire N references
421
+ void TEST_RefN(ClockHandle& handle, size_t n);
422
+ // Helper for TEST_ReleaseN
423
+ void TEST_ReleaseNMinus1(ClockHandle* handle, size_t n);
424
+ #endif
425
+
426
+ private: // fns
427
+ // Creates a "standalone" handle for returning from an Insert operation that
428
+ // cannot be completed by actually inserting into the table.
429
+ // Updates `standalone_usage_` but not `usage_` nor `occupancy_`.
430
+ template <class HandleImpl>
431
+ HandleImpl* StandaloneInsert(const ClockHandleBasicData& proto);
432
+
433
+ // Helper for updating `usage_` for new entry with given `total_charge`
434
+ // and evicting if needed under strict_capacity_limit=true rules. This
435
+ // means the operation might fail with Status::MemoryLimit. If
436
+ // `need_evict_for_occupancy`, then eviction of at least one entry is
437
+ // required, and the operation should fail if not possible.
438
+ // NOTE: Otherwise, occupancy_ is not managed in this function
439
+ template <class Table>
440
+ Status ChargeUsageMaybeEvictStrict(size_t total_charge, size_t capacity,
441
+ bool need_evict_for_occupancy,
442
+ typename Table::InsertState& state);
443
+
444
+ // Helper for updating `usage_` for new entry with given `total_charge`
445
+ // and evicting if needed under strict_capacity_limit=false rules. This
446
+ // means that updating `usage_` always succeeds even if forced to exceed
447
+ // capacity. If `need_evict_for_occupancy`, then eviction of at least one
448
+ // entry is required, and the operation should return false if such eviction
449
+ // is not possible. `usage_` is not updated in that case. Otherwise, returns
450
+ // true, indicating success.
451
+ // NOTE: occupancy_ is not managed in this function
452
+ template <class Table>
453
+ bool ChargeUsageMaybeEvictNonStrict(size_t total_charge, size_t capacity,
454
+ bool need_evict_for_occupancy,
455
+ typename Table::InsertState& state);
456
+
457
+ protected: // data
458
+ // We partition the following members into different cache lines
459
+ // to avoid false sharing among Lookup, Release, Erase and Insert
460
+ // operations in ClockCacheShard.
461
+
462
+ // Clock algorithm sweep pointer.
463
+ std::atomic<uint64_t> clock_pointer_{};
464
+
465
+ // Counter for number of times we yield to wait on another thread.
466
+ std::atomic<uint64_t> yield_count_{};
467
+
468
+ // TODO: is this separation needed if we don't do background evictions?
469
+ ALIGN_AS(CACHE_LINE_SIZE)
470
+ // Number of elements in the table.
471
+ std::atomic<size_t> occupancy_{};
472
+
473
+ // Memory usage by entries tracked by the cache (including standalone)
474
+ std::atomic<size_t> usage_{};
475
+
476
+ // Part of usage by standalone entries (not in table)
477
+ std::atomic<size_t> standalone_usage_{};
478
+
479
+ ALIGN_AS(CACHE_LINE_SIZE)
480
+ const CacheMetadataChargePolicy metadata_charge_policy_;
481
+
482
+ // From Cache, for deleter
483
+ MemoryAllocator* const allocator_;
484
+
485
+ // A reference to Cache::eviction_callback_
486
+ const Cache::EvictionCallback& eviction_callback_;
487
+
488
+ // A reference to ShardedCacheBase::hash_seed_
489
+ const uint32_t& hash_seed_;
490
+ };
491
+
492
+ // Hash table for cache entries with size determined at creation time.
493
+ // Uses open addressing and double hashing. Since entries cannot be moved,
494
+ // the "displacements" count ensures probing sequences find entries even when
495
+ // entries earlier in the probing sequence have been removed.
496
+ class FixedHyperClockTable : public BaseClockTable {
382
497
  public:
383
498
  // Target size to be exactly a common cache line size (see static_assert in
384
499
  // clock_cache.cc)
@@ -400,88 +515,104 @@ class HyperClockTable {
400
515
  }; // struct HandleImpl
401
516
 
402
517
  struct Opts {
518
+ explicit Opts(size_t _estimated_value_size)
519
+ : estimated_value_size(_estimated_value_size) {}
520
+ explicit Opts(const HyperClockCacheOptions& opts) {
521
+ assert(opts.estimated_entry_charge > 0);
522
+ estimated_value_size = opts.estimated_entry_charge;
523
+ }
403
524
  size_t estimated_value_size;
404
525
  };
405
526
 
406
- HyperClockTable(size_t capacity, bool strict_capacity_limit,
407
- CacheMetadataChargePolicy metadata_charge_policy,
408
- MemoryAllocator* allocator,
409
- const Cache::EvictionCallback* eviction_callback,
410
- const uint32_t* hash_seed, const Opts& opts);
411
- ~HyperClockTable();
527
+ FixedHyperClockTable(size_t capacity, bool strict_capacity_limit,
528
+ CacheMetadataChargePolicy metadata_charge_policy,
529
+ MemoryAllocator* allocator,
530
+ const Cache::EvictionCallback* eviction_callback,
531
+ const uint32_t* hash_seed, const Opts& opts);
532
+ ~FixedHyperClockTable();
533
+
534
+ // For BaseClockTable::Insert
535
+ struct InsertState {};
412
536
 
413
- Status Insert(const ClockHandleBasicData& proto, HandleImpl** handle,
414
- Cache::Priority priority, size_t capacity,
415
- bool strict_capacity_limit);
537
+ void StartInsert(InsertState& state);
416
538
 
417
- HandleImpl* CreateStandalone(ClockHandleBasicData& proto, size_t capacity,
418
- bool strict_capacity_limit,
419
- bool allow_uncharged);
539
+ // Returns true iff there is room for the proposed number of entries.
540
+ bool GrowIfNeeded(size_t new_occupancy, InsertState& state);
541
+
542
+ HandleImpl* DoInsert(const ClockHandleBasicData& proto,
543
+ uint64_t initial_countdown, bool take_ref,
544
+ InsertState& state);
545
+
546
+ // Runs the clock eviction algorithm trying to reclaim at least
547
+ // requested_charge. Returns how much is evicted, which could be less
548
+ // if it appears impossible to evict the requested amount without blocking.
549
+ void Evict(size_t requested_charge, InsertState& state, EvictionData* data);
420
550
 
421
551
  HandleImpl* Lookup(const UniqueId64x2& hashed_key);
422
552
 
423
553
  bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref);
424
554
 
425
- void Ref(HandleImpl& handle);
426
-
427
555
  void Erase(const UniqueId64x2& hashed_key);
428
556
 
429
- void ConstApplyToEntriesRange(std::function<void(const HandleImpl&)> func,
430
- size_t index_begin, size_t index_end,
431
- bool apply_if_will_be_deleted) const;
432
-
433
557
  void EraseUnRefEntries();
434
558
 
435
559
  size_t GetTableSize() const { return size_t{1} << length_bits_; }
436
560
 
437
- int GetLengthBits() const { return length_bits_; }
438
-
439
- size_t GetOccupancy() const {
440
- return occupancy_.load(std::memory_order_relaxed);
441
- }
442
-
443
561
  size_t GetOccupancyLimit() const { return occupancy_limit_; }
444
562
 
445
- size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); }
563
+ const HandleImpl* HandlePtr(size_t idx) const { return &array_[idx]; }
446
564
 
447
- size_t GetStandaloneUsage() const {
448
- return standalone_usage_.load(std::memory_order_relaxed);
565
+ #ifndef NDEBUG
566
+ size_t& TEST_MutableOccupancyLimit() {
567
+ return const_cast<size_t&>(occupancy_limit_);
449
568
  }
450
569
 
451
- uint32_t GetHashSeed() const { return hash_seed_; }
452
-
453
- // Acquire/release N references
454
- void TEST_RefN(HandleImpl& handle, size_t n);
570
+ // Release N references
455
571
  void TEST_ReleaseN(HandleImpl* handle, size_t n);
572
+ #endif
573
+
574
+ // The load factor p is a real number in (0, 1) such that at all
575
+ // times at most a fraction p of all slots, without counting tombstones,
576
+ // are occupied by elements. This means that the probability that a random
577
+ // probe hits an occupied slot is at most p, and thus at most 1/p probes
578
+ // are required on average. For example, p = 70% implies that between 1 and 2
579
+ // probes are needed on average (bear in mind that this reasoning doesn't
580
+ // consider the effects of clustering over time, which should be negligible
581
+ // with double hashing).
582
+ // Because the size of the hash table is always rounded up to the next
583
+ // power of 2, p is really an upper bound on the actual load factor---the
584
+ // actual load factor is anywhere between p/2 and p. This is a bit wasteful,
585
+ // but bear in mind that slots only hold metadata, not actual values.
586
+ // Since space cost is dominated by the values (the LSM blocks),
587
+ // overprovisioning the table with metadata only increases the total cache
588
+ // space usage by a tiny fraction.
589
+ static constexpr double kLoadFactor = 0.7;
590
+
591
+ // The user can exceed kLoadFactor if the sizes of the inserted values don't
592
+ // match estimated_value_size, or in some rare cases with
593
+ // strict_capacity_limit == false. To avoid degenerate performance, we set a
594
+ // strict upper bound on the load factor.
595
+ static constexpr double kStrictLoadFactor = 0.84;
456
596
 
457
597
  private: // functions
458
598
  // Returns x mod 2^{length_bits_}.
459
599
  inline size_t ModTableSize(uint64_t x) {
460
- return static_cast<size_t>(x) & length_bits_mask_;
600
+ return BitwiseAnd(x, length_bits_mask_);
461
601
  }
462
602
 
463
- // Runs the clock eviction algorithm trying to reclaim at least
464
- // requested_charge. Returns how much is evicted, which could be less
465
- // if it appears impossible to evict the requested amount without blocking.
466
- inline void Evict(size_t requested_charge, size_t* freed_charge,
467
- size_t* freed_count);
468
-
469
- // Returns the first slot in the probe sequence, starting from the given
470
- // probe number, with a handle e such that match(e) is true. At every
471
- // step, the function first tests whether match(e) holds. If this is false,
472
- // it evaluates abort(e) to decide whether the search should be aborted,
473
- // and in the affirmative returns -1. For every handle e probed except
474
- // the last one, the function runs update(e).
475
- // The probe parameter is modified as follows. We say a probe to a handle
476
- // e is aborting if match(e) is false and abort(e) is true. Then the final
477
- // value of probe is one more than the last non-aborting probe during the
478
- // call. This is so that that the variable can be used to keep track of
479
- // progress across consecutive calls to FindSlot.
603
+ // Returns the first slot in the probe sequence with a handle e such that
604
+ // match_fn(e) is true. At every step, the function first tests whether
605
+ // match_fn(e) holds. If this is false, it evaluates abort_fn(e) to decide
606
+ // whether the search should be aborted, and if so, FindSlot immediately
607
+ // returns nullptr. For every handle e that is not a match and not aborted,
608
+ // FindSlot runs update_fn(e, is_last) where is_last is set to true iff that
609
+ // slot will be the last probed because the next would cycle back to the first
610
+ // slot probed. This function uses templates instead of std::function to
611
+ // minimize the risk of heap-allocated closures being created.
612
+ template <typename MatchFn, typename AbortFn, typename UpdateFn>
480
613
  inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key,
481
- std::function<bool(HandleImpl*)> match,
482
- std::function<bool(HandleImpl*)> stop,
483
- std::function<void(HandleImpl*)> update,
484
- size_t& probe);
614
+ const MatchFn& match_fn, const AbortFn& abort_fn,
615
+ const UpdateFn& update_fn);
485
616
 
486
617
  // Re-decrement all displacements in probe path starting from beginning
487
618
  // until (not including) the given handle
@@ -494,33 +625,6 @@ class HyperClockTable {
494
625
  // before releasing it so that it can be provided to this function.
495
626
  inline void ReclaimEntryUsage(size_t total_charge);
496
627
 
497
- // Helper for updating `usage_` for new entry with given `total_charge`
498
- // and evicting if needed under strict_capacity_limit=true rules. This
499
- // means the operation might fail with Status::MemoryLimit. If
500
- // `need_evict_for_occupancy`, then eviction of at least one entry is
501
- // required, and the operation should fail if not possible.
502
- // NOTE: Otherwise, occupancy_ is not managed in this function
503
- inline Status ChargeUsageMaybeEvictStrict(size_t total_charge,
504
- size_t capacity,
505
- bool need_evict_for_occupancy);
506
-
507
- // Helper for updating `usage_` for new entry with given `total_charge`
508
- // and evicting if needed under strict_capacity_limit=false rules. This
509
- // means that updating `usage_` always succeeds even if forced to exceed
510
- // capacity. If `need_evict_for_occupancy`, then eviction of at least one
511
- // entry is required, and the operation should return false if such eviction
512
- // is not possible. `usage_` is not updated in that case. Otherwise, returns
513
- // true, indicating success.
514
- // NOTE: occupancy_ is not managed in this function
515
- inline bool ChargeUsageMaybeEvictNonStrict(size_t total_charge,
516
- size_t capacity,
517
- bool need_evict_for_occupancy);
518
-
519
- // Creates a "standalone" handle for returning from an Insert operation that
520
- // cannot be completed by actually inserting into the table.
521
- // Updates `standalone_usage_` but not `usage_` nor `occupancy_`.
522
- inline HandleImpl* StandaloneInsert(const ClockHandleBasicData& proto);
523
-
524
628
  MemoryAllocator* GetAllocator() const { return allocator_; }
525
629
 
526
630
  // Returns the number of bits used to hash an element in the hash
@@ -541,39 +645,323 @@ class HyperClockTable {
541
645
 
542
646
  // Array of slots comprising the hash table.
543
647
  const std::unique_ptr<HandleImpl[]> array_;
648
+ }; // class FixedHyperClockTable
544
649
 
545
- // From Cache, for deleter
546
- MemoryAllocator* const allocator_;
650
+ // Hash table for cache entries that resizes automatically based on occupancy.
651
+ // However, it depends on a contiguous memory region to grow into
652
+ // incrementally, using linear hashing, so uses an anonymous mmap so that
653
+ // only the used portion of the memory region is mapped to physical memory
654
+ // (part of RSS).
655
+ //
656
+ // This table implementation uses the same "low-level protocol" for managing
657
+ // the contens of an entry slot as FixedHyperClockTable does, captured in the
658
+ // ClockHandle struct. The provides most of the essential data safety, but
659
+ // AutoHyperClockTable is another "high-level protocol" for organizing entries
660
+ // into a hash table, with automatic resizing.
661
+ //
662
+ // This implementation is not fully wait-free but we can call it "essentially
663
+ // wait-free," and here's why. First, like FixedHyperClockCache, there is no
664
+ // locking nor other forms of waiting at the cache or shard level. Also like
665
+ // FixedHCC there is essentially an entry-level read-write lock implemented
666
+ // with atomics, but our relaxed atomicity/consistency guarantees (e.g.
667
+ // duplicate inserts are possible) mean we do not need to wait for entry
668
+ // locking. Lookups, non-erasing Releases, and non-evicting non-growing Inserts
669
+ // are all fully wait-free. Of course, these waits are not dependent on any
670
+ // external factors such as I/O.
671
+ //
672
+ // For operations that remove entries from a chain or grow the table by
673
+ // splitting a chain, there is a chain-level locking mechanism that we call a
674
+ // "rewrite" lock, and the only waits are for these locks. On average, each
675
+ // chain lock is relevant to < 2 entries each. (The average would be less than
676
+ // one entry each, but we do not lock when there's no entry to remove or
677
+ // migrate.) And a given thread can only hold two such chain locks at a time,
678
+ // more typically just one. So in that sense alone, the waiting that does exist
679
+ // is very localized.
680
+ //
681
+ // If we look closer at the operations utilizing that locking mechanism, we
682
+ // can see why it's "essentially wait-free."
683
+ // * Grow operations to increase the size of the table: each operation splits
684
+ // an existing chain into two, and chains for splitting are chosen in table
685
+ // order. Grow operations are fully parallel except for the chain locking, but
686
+ // for one Grow operation to wait on another, it has to be feeding into the
687
+ // other, which means the table has doubled in size already from other Grow
688
+ // operations without the original one finishing. So Grow operations are very
689
+ // low latency (unlike LRUCache doubling the table size in one operation) and
690
+ // very parallelizeable. (We use some tricks to break up dependencies in
691
+ // updating metadata on the usable size of the table.) And obviously Grow
692
+ // operations are very rare after the initial population of the table.
693
+ // * Evict operations (part of many Inserts): clock updates and evictions
694
+ // sweep through the structure in table order, so like Grow operations,
695
+ // parallel Evict can only wait on each other if an Evict has lingered (slept)
696
+ // long enough that the clock pointer has wrapped around the entire structure.
697
+ // * Random erasures (Erase, Release with erase_if_last_ref, etc.): these
698
+ // operations are rare and not really considered performance critical.
699
+ // Currently they're mostly used for removing placeholder cache entries, e.g.
700
+ // for memory tracking, though that could use standalone entries instead to
701
+ // avoid potential contention in table operations. It's possible that future
702
+ // enhancements could pro-actively remove cache entries from obsolete files,
703
+ // but that's not yet implemented.
704
+ class AutoHyperClockTable : public BaseClockTable {
705
+ public:
706
+ // Target size to be exactly a common cache line size (see static_assert in
707
+ // clock_cache.cc)
708
+ struct ALIGN_AS(64U) HandleImpl : public ClockHandle {
709
+ // To orgainize AutoHyperClockTable entries into a hash table while
710
+ // allowing the table size to grow without existing entries being moved,
711
+ // a version of chaining is used. Rather than being heap allocated (and
712
+ // incurring overheads to ensure memory safety) entries must go into
713
+ // Handles ("slots") in the pre-allocated array. To improve CPU cache
714
+ // locality, the chain head pointers are interleved with the entries;
715
+ // specifically, a Handle contains
716
+ // * A head pointer for a chain of entries with this "home" location.
717
+ // * A ClockHandle, for an entry that may or may not be in the chain
718
+ // starting from that head (but for performance ideally is on that
719
+ // chain).
720
+ // * A next pointer for the continuation of the chain containing this
721
+ // entry.
722
+ //
723
+ // The pointers are not raw pointers, but are indices into the array,
724
+ // and are decorated in two ways to help detect and recover from
725
+ // relevant concurrent modifications during Lookup, so that Lookup is
726
+ // fully wait-free:
727
+ // * Each "with_shift" pointer contains a shift count that indicates
728
+ // how many hash bits were used in chosing the home address for the
729
+ // chain--specifically the next entry in the chain.
730
+ // * The end of a chain is given a special "end" marker and refers back
731
+ // to the head of the chain.
732
+ //
733
+ // Why do we need shift on each pointer? To make Lookup wait-free, we need
734
+ // to be able to query a chain without missing anything, and preferably
735
+ // avoid synchronously double-checking the length_info. Without the shifts,
736
+ // there is a risk that we start down a chain and while paused on an entry
737
+ // that goes to a new home, we then follow the rest of the
738
+ // partially-migrated chain to see the shared ending with the old home, but
739
+ // for a time were following the chain for the new home, missing some
740
+ // entries for the old home.
741
+ //
742
+ // Why do we need the end of the chain to loop back? If Lookup pauses
743
+ // at an "under construction" entry, and sees that "next" is null after
744
+ // waking up, we need something to tell whether the "under construction"
745
+ // entry was freed and reused for another chain. Otherwise, we could
746
+ // miss entries still on the original chain due in the presence of a
747
+ // concurrent modification. Until an entry is fully erased from a chain,
748
+ // it is normal to see "under construction" entries on the chain, and it
749
+ // is not safe to read their hashed key without either a read reference
750
+ // on the entry or a rewrite lock on the chain.
751
+
752
+ // Marker in a "with_shift" head pointer for some thread owning writes
753
+ // to the chain structure (except for inserts), but only if not an
754
+ // "end" pointer. Also called the "rewrite lock."
755
+ static constexpr uint64_t kHeadLocked = uint64_t{1} << 7;
756
+
757
+ // Marker in a "with_shift" pointer for the end of a chain. Must also
758
+ // point back to the head of the chain (with end marker removed).
759
+ // Also includes the "locked" bit so that attempting to lock an empty
760
+ // chain has no effect (not needed, as the lock is only needed for
761
+ // removals).
762
+ static constexpr uint64_t kNextEndFlags = (uint64_t{1} << 6) | kHeadLocked;
763
+
764
+ static inline bool IsEnd(uint64_t next_with_shift) {
765
+ // Assuming certain values never used, suffices to check this one bit
766
+ constexpr auto kCheckBit = kNextEndFlags ^ kHeadLocked;
767
+ return next_with_shift & kCheckBit;
768
+ }
769
+
770
+ // Bottom bits to right shift away to get an array index from a
771
+ // "with_shift" pointer.
772
+ static constexpr int kNextShift = 8;
773
+
774
+ // A bit mask for the "shift" associated with each "with_shift" pointer.
775
+ // Always bottommost bits.
776
+ static constexpr int kShiftMask = 63;
777
+
778
+ // A marker for head_next_with_shift that indicates this HandleImpl is
779
+ // heap allocated (standalone) rather than in the table.
780
+ static constexpr uint64_t kStandaloneMarker = UINT64_MAX;
781
+
782
+ // A marker for head_next_with_shift indicating the head is not yet part
783
+ // of the usable table, or for chain_next_with_shift indicating that the
784
+ // entry is not present or is not yet part of a chain (must not be
785
+ // "shareable" state).
786
+ static constexpr uint64_t kUnusedMarker = 0;
787
+
788
+ // See above. The head pointer is logically independent of the rest of
789
+ // the entry, including the chain next pointer.
790
+ std::atomic<uint64_t> head_next_with_shift{kUnusedMarker};
791
+ std::atomic<uint64_t> chain_next_with_shift{kUnusedMarker};
792
+
793
+ // For supporting CreateStandalone and some fallback cases.
794
+ inline bool IsStandalone() const {
795
+ return head_next_with_shift.load(std::memory_order_acquire) ==
796
+ kStandaloneMarker;
797
+ }
798
+
799
+ inline void SetStandalone() {
800
+ head_next_with_shift.store(kStandaloneMarker, std::memory_order_release);
801
+ }
802
+ }; // struct HandleImpl
547
803
 
548
- // A reference to Cache::eviction_callback_
549
- const Cache::EvictionCallback& eviction_callback_;
804
+ struct Opts {
805
+ explicit Opts(size_t _min_avg_value_size)
806
+ : min_avg_value_size(_min_avg_value_size) {}
807
+
808
+ explicit Opts(const HyperClockCacheOptions& opts) {
809
+ assert(opts.estimated_entry_charge == 0);
810
+ min_avg_value_size = opts.min_avg_entry_charge;
811
+ }
812
+ size_t min_avg_value_size;
813
+ };
550
814
 
551
- // A reference to ShardedCacheBase::hash_seed_
552
- const uint32_t& hash_seed_;
815
+ AutoHyperClockTable(size_t capacity, bool strict_capacity_limit,
816
+ CacheMetadataChargePolicy metadata_charge_policy,
817
+ MemoryAllocator* allocator,
818
+ const Cache::EvictionCallback* eviction_callback,
819
+ const uint32_t* hash_seed, const Opts& opts);
820
+ ~AutoHyperClockTable();
821
+
822
+ // For BaseClockTable::Insert
823
+ struct InsertState {
824
+ uint64_t saved_length_info = 0;
825
+ size_t likely_empty_slot = 0;
826
+ };
553
827
 
554
- // We partition the following members into different cache lines
555
- // to avoid false sharing among Lookup, Release, Erase and Insert
556
- // operations in ClockCacheShard.
828
+ void StartInsert(InsertState& state);
557
829
 
558
- ALIGN_AS(CACHE_LINE_SIZE)
559
- // Clock algorithm sweep pointer.
560
- std::atomic<uint64_t> clock_pointer_{};
830
+ // Does initial check for whether there's hash table room for another
831
+ // inserted entry, possibly growing if needed. Returns true iff (after
832
+ // the call) there is room for the proposed number of entries.
833
+ bool GrowIfNeeded(size_t new_occupancy, InsertState& state);
561
834
 
562
- ALIGN_AS(CACHE_LINE_SIZE)
563
- // Number of elements in the table.
564
- std::atomic<size_t> occupancy_{};
835
+ HandleImpl* DoInsert(const ClockHandleBasicData& proto,
836
+ uint64_t initial_countdown, bool take_ref,
837
+ InsertState& state);
565
838
 
566
- // Memory usage by entries tracked by the cache (including standalone)
567
- std::atomic<size_t> usage_{};
839
+ // Runs the clock eviction algorithm trying to reclaim at least
840
+ // requested_charge. Returns how much is evicted, which could be less
841
+ // if it appears impossible to evict the requested amount without blocking.
842
+ void Evict(size_t requested_charge, InsertState& state, EvictionData* data);
568
843
 
569
- // Part of usage by standalone entries (not in table)
570
- std::atomic<size_t> standalone_usage_{};
571
- }; // class HyperClockTable
844
+ HandleImpl* Lookup(const UniqueId64x2& hashed_key);
845
+
846
+ bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref);
847
+
848
+ void Erase(const UniqueId64x2& hashed_key);
849
+
850
+ void EraseUnRefEntries();
851
+
852
+ size_t GetTableSize() const;
853
+
854
+ size_t GetOccupancyLimit() const;
855
+
856
+ const HandleImpl* HandlePtr(size_t idx) const { return &array_[idx]; }
857
+
858
+ #ifndef NDEBUG
859
+ size_t& TEST_MutableOccupancyLimit() {
860
+ return *reinterpret_cast<size_t*>(&occupancy_limit_);
861
+ }
862
+
863
+ // Release N references
864
+ void TEST_ReleaseN(HandleImpl* handle, size_t n);
865
+ #endif
866
+
867
+ // Maximum ratio of number of occupied slots to number of usable slots. The
868
+ // actual load factor should float pretty close to this number, which should
869
+ // be a nice space/time trade-off, though large swings in WriteBufferManager
870
+ // memory could lead to low (but very much safe) load factors (only after
871
+ // seeing high load factors). Linear hashing along with (modified) linear
872
+ // probing to find an available slot increases potential risks of high
873
+ // load factors, so are disallowed.
874
+ static constexpr double kMaxLoadFactor = 0.60;
875
+
876
+ private: // functions
877
+ // Returns true iff increased usable length. Due to load factor
878
+ // considerations, GrowIfNeeded might call this more than once to make room
879
+ // for one more entry.
880
+ bool Grow(InsertState& state);
881
+
882
+ // Operational details of splitting a chain into two for Grow().
883
+ void SplitForGrow(size_t grow_home, size_t old_home, int old_shift);
884
+
885
+ // Takes an "under construction" entry and ensures it is no longer connected
886
+ // to its home chain (in preparaion for completing erasure and freeing the
887
+ // slot). Note that previous operations might have already noticed it being
888
+ // "under (de)construction" and removed it from its chain.
889
+ void Remove(HandleImpl* h);
890
+
891
+ // Try to take ownership of an entry and erase+remove it from the table.
892
+ // Returns true if successful. Could fail if
893
+ // * There are other references to the entry
894
+ // * Some other thread has exclusive ownership or has freed it.
895
+ bool TryEraseHandle(HandleImpl* h, bool holding_ref, bool mark_invisible);
896
+
897
+ // Calculates the appropriate maximum table size, for creating the memory
898
+ // mapping.
899
+ static size_t CalcMaxUsableLength(
900
+ size_t capacity, size_t min_avg_value_size,
901
+ CacheMetadataChargePolicy metadata_charge_policy);
902
+
903
+ // Shared helper function that implements removing entries from a chain
904
+ // with proper handling to ensure all existing data is seen even in the
905
+ // presence of concurrent insertions, etc. (See implementation.)
906
+ template <class OpData>
907
+ void PurgeImpl(OpData* op_data, size_t home = SIZE_MAX);
908
+
909
+ // An RAII wrapper for locking a chain of entries for removals. See
910
+ // implementation.
911
+ class ChainRewriteLock;
912
+
913
+ // Helper function for PurgeImpl while holding a ChainRewriteLock. See
914
+ // implementation.
915
+ template <class OpData>
916
+ void PurgeImplLocked(OpData* op_data, ChainRewriteLock& rewrite_lock,
917
+ size_t home);
918
+
919
+ // Update length_info_ as much as possible without waiting, given a known
920
+ // usable (ready for inserts and lookups) grow_home. (Previous grow_homes
921
+ // might not be usable yet, but we can check if they are by looking at
922
+ // the corresponding old home.)
923
+ void CatchUpLengthInfoNoWait(size_t known_usable_grow_home);
924
+
925
+ private: // data
926
+ // mmaped area holding handles
927
+ const TypedMemMapping<HandleImpl> array_;
928
+
929
+ // Metadata for table size under linear hashing.
930
+ //
931
+ // Lowest 8 bits are the minimum number of lowest hash bits to use
932
+ // ("min shift"). The upper 56 bits are a threshold. If that minumum number
933
+ // of bits taken from a hash value is < this threshold, then one more bit of
934
+ // hash value is taken and used.
935
+ //
936
+ // Other mechanisms (shift amounts on pointers) ensure complete availability
937
+ // of data already in the table even if a reader only sees a completely
938
+ // out-of-date version of this value. In the worst case, it could take
939
+ // log time to find the correct chain, but normally this value enables
940
+ // readers to find the correct chain on the first try.
941
+ //
942
+ // NOTES: length_info_ is only updated at the end of a Grow operation,
943
+ // so that waiting in Grow operations isn't done while entries are pinned
944
+ // for internal operation purposes. Thus, Lookup and Insert have to
945
+ // detect and support cases where length_info hasn't caught up to updated
946
+ // chains. Winning grow thread is the one that transitions
947
+ // head_next_with_shift from zeros. Grow threads can spin/yield wait for
948
+ // preconditions and postconditions to be met.
949
+ std::atomic<uint64_t> length_info_;
950
+
951
+ // An already-computed version of the usable length times the max load
952
+ // factor. Could be slightly out of date but GrowIfNeeded()/Grow() handle
953
+ // that internally.
954
+ std::atomic<size_t> occupancy_limit_;
955
+
956
+ // See explanation in AutoHyperClockTable::Evict
957
+ std::atomic<size_t> clock_pointer_mask_;
958
+ }; // class AutoHyperClockTable
572
959
 
573
960
  // A single shard of sharded cache.
574
- template <class Table>
961
+ template <class TableT>
575
962
  class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
576
963
  public:
964
+ using Table = TableT;
577
965
  ClockCacheShard(size_t capacity, bool strict_capacity_limit,
578
966
  CacheMetadataChargePolicy metadata_charge_policy,
579
967
  MemoryAllocator* allocator,
@@ -666,9 +1054,17 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
666
1054
  return Lookup(key, hashed_key);
667
1055
  }
668
1056
 
1057
+ Table& GetTable() { return table_; }
1058
+ const Table& GetTable() const { return table_; }
1059
+
1060
+ #ifndef NDEBUG
1061
+ size_t& TEST_MutableOccupancyLimit() {
1062
+ return table_.TEST_MutableOccupancyLimit();
1063
+ }
669
1064
  // Acquire/release N references
670
1065
  void TEST_RefN(HandleImpl* handle, size_t n);
671
1066
  void TEST_ReleaseN(HandleImpl* handle, size_t n);
1067
+ #endif
672
1068
 
673
1069
  private: // data
674
1070
  Table table_;
@@ -680,17 +1076,14 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
680
1076
  std::atomic<bool> strict_capacity_limit_;
681
1077
  }; // class ClockCacheShard
682
1078
 
683
- class HyperClockCache
684
- #ifdef NDEBUG
685
- final
686
- #endif
687
- : public ShardedCache<ClockCacheShard<HyperClockTable>> {
1079
+ template <class Table>
1080
+ class BaseHyperClockCache : public ShardedCache<ClockCacheShard<Table>> {
688
1081
  public:
689
- using Shard = ClockCacheShard<HyperClockTable>;
690
-
691
- explicit HyperClockCache(const HyperClockCacheOptions& opts);
1082
+ using Shard = ClockCacheShard<Table>;
1083
+ using Handle = Cache::Handle;
1084
+ using CacheItemHelper = Cache::CacheItemHelper;
692
1085
 
693
- const char* Name() const override { return "HyperClockCache"; }
1086
+ explicit BaseHyperClockCache(const HyperClockCacheOptions& opts);
694
1087
 
695
1088
  Cache::ObjectPtr Value(Handle* handle) override;
696
1089
 
@@ -700,7 +1093,35 @@ class HyperClockCache
700
1093
 
701
1094
  void ReportProblems(
702
1095
  const std::shared_ptr<Logger>& /*info_log*/) const override;
703
- }; // class HyperClockCache
1096
+ };
1097
+
1098
+ class FixedHyperClockCache
1099
+ #ifdef NDEBUG
1100
+ final
1101
+ #endif
1102
+ : public BaseHyperClockCache<FixedHyperClockTable> {
1103
+ public:
1104
+ using BaseHyperClockCache::BaseHyperClockCache;
1105
+
1106
+ const char* Name() const override { return "FixedHyperClockCache"; }
1107
+
1108
+ void ReportProblems(
1109
+ const std::shared_ptr<Logger>& /*info_log*/) const override;
1110
+ }; // class FixedHyperClockCache
1111
+
1112
+ class AutoHyperClockCache
1113
+ #ifdef NDEBUG
1114
+ final
1115
+ #endif
1116
+ : public BaseHyperClockCache<AutoHyperClockTable> {
1117
+ public:
1118
+ using BaseHyperClockCache::BaseHyperClockCache;
1119
+
1120
+ const char* Name() const override { return "AutoHyperClockCache"; }
1121
+
1122
+ void ReportProblems(
1123
+ const std::shared_ptr<Logger>& /*info_log*/) const override;
1124
+ }; // class AutoHyperClockCache
704
1125
 
705
1126
  } // namespace clock_cache
706
1127