@nxtedition/rocksdb 13.1.4 → 13.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. package/binding.cc +43 -16
  2. package/deps/rocksdb/rocksdb/{TARGETS → BUCK} +27 -0
  3. package/deps/rocksdb/rocksdb/CMakeLists.txt +3 -1
  4. package/deps/rocksdb/rocksdb/Makefile +2 -2
  5. package/deps/rocksdb/rocksdb/cache/cache.cc +3 -1
  6. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +2 -0
  7. package/deps/rocksdb/rocksdb/db/attribute_group_iterator_impl.h +34 -9
  8. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +7 -6
  9. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +5 -1
  10. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +22 -14
  11. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +149 -0
  12. package/deps/rocksdb/rocksdb/db/builder.cc +13 -24
  13. package/deps/rocksdb/rocksdb/db/coalescing_iterator.h +35 -10
  14. package/deps/rocksdb/rocksdb/db/column_family.cc +21 -10
  15. package/deps/rocksdb/rocksdb/db/column_family.h +15 -8
  16. package/deps/rocksdb/rocksdb/db/column_family_test.cc +98 -7
  17. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +126 -16
  18. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +51 -5
  19. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +2 -2
  20. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +2 -8
  21. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +24 -0
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +52 -22
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +9 -7
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +36 -9
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +6 -0
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +30 -17
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +26 -23
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +43 -33
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +6 -5
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +19 -9
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +6 -5
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +632 -411
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +171 -51
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +7 -5
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +37 -10
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +51 -11
  37. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +10 -3
  38. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +350 -154
  39. package/deps/rocksdb/rocksdb/db/convenience.cc +1 -1
  40. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +62 -27
  41. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +68 -1
  42. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +91 -0
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +134 -70
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +71 -23
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +43 -16
  46. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +47 -33
  47. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +27 -19
  48. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +38 -25
  49. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +3 -3
  50. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +7 -4
  51. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +258 -42
  52. package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +161 -9
  53. package/deps/rocksdb/rocksdb/db/db_iter.cc +118 -86
  54. package/deps/rocksdb/rocksdb/db/db_iter.h +44 -17
  55. package/deps/rocksdb/rocksdb/db/db_options_test.cc +27 -6
  56. package/deps/rocksdb/rocksdb/db/db_test.cc +48 -16
  57. package/deps/rocksdb/rocksdb/db/db_test2.cc +60 -15
  58. package/deps/rocksdb/rocksdb/db/db_test_util.cc +97 -44
  59. package/deps/rocksdb/rocksdb/db/db_test_util.h +7 -1
  60. package/deps/rocksdb/rocksdb/db/dbformat.cc +15 -5
  61. package/deps/rocksdb/rocksdb/db/dbformat.h +137 -55
  62. package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -0
  63. package/deps/rocksdb/rocksdb/db/experimental.cc +54 -0
  64. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +663 -8
  65. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +152 -91
  66. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +134 -11
  67. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +55 -9
  68. package/deps/rocksdb/rocksdb/db/flush_job.cc +52 -29
  69. package/deps/rocksdb/rocksdb/db/flush_job.h +5 -3
  70. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +18 -12
  71. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +23 -29
  72. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +3 -2
  73. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +2 -0
  74. package/deps/rocksdb/rocksdb/db/internal_stats.cc +9 -6
  75. package/deps/rocksdb/rocksdb/db/internal_stats.h +54 -0
  76. package/deps/rocksdb/rocksdb/db/job_context.h +1 -1
  77. package/deps/rocksdb/rocksdb/db/log_reader.cc +6 -7
  78. package/deps/rocksdb/rocksdb/db/manifest_ops.cc +47 -0
  79. package/deps/rocksdb/rocksdb/db/manifest_ops.h +20 -0
  80. package/deps/rocksdb/rocksdb/db/memtable.cc +165 -64
  81. package/deps/rocksdb/rocksdb/db/memtable.h +422 -243
  82. package/deps/rocksdb/rocksdb/db/memtable_list.cc +99 -68
  83. package/deps/rocksdb/rocksdb/db/memtable_list.h +63 -38
  84. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +28 -25
  85. package/deps/rocksdb/rocksdb/db/multi_cf_iterator_impl.h +118 -60
  86. package/deps/rocksdb/rocksdb/db/multi_cf_iterator_test.cc +344 -89
  87. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +2 -3
  88. package/deps/rocksdb/rocksdb/db/repair.cc +15 -14
  89. package/deps/rocksdb/rocksdb/db/repair_test.cc +0 -13
  90. package/deps/rocksdb/rocksdb/db/snapshot_checker.h +7 -0
  91. package/deps/rocksdb/rocksdb/db/table_cache.cc +62 -65
  92. package/deps/rocksdb/rocksdb/db/table_cache.h +70 -76
  93. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +5 -6
  94. package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +1 -1
  95. package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +8 -7
  96. package/deps/rocksdb/rocksdb/db/version_builder.cc +17 -19
  97. package/deps/rocksdb/rocksdb/db/version_builder.h +13 -12
  98. package/deps/rocksdb/rocksdb/db/version_edit.h +30 -0
  99. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +3 -5
  100. package/deps/rocksdb/rocksdb/db/version_set.cc +89 -129
  101. package/deps/rocksdb/rocksdb/db/version_set.h +12 -4
  102. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +1 -2
  103. package/deps/rocksdb/rocksdb/db/version_set_test.cc +12 -8
  104. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +0 -15
  105. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +0 -2
  106. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +9 -7
  107. package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.cc +0 -8
  108. package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.h +28 -2
  109. package/deps/rocksdb/rocksdb/db/write_batch.cc +32 -10
  110. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +9 -0
  111. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +2 -1
  112. package/deps/rocksdb/rocksdb/db/write_thread.cc +3 -1
  113. package/deps/rocksdb/rocksdb/db/write_thread.h +6 -2
  114. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +15 -0
  115. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +7 -0
  116. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +4 -0
  117. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +18 -2
  118. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +100 -22
  119. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +15 -4
  120. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +34 -8
  121. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +223 -78
  122. package/deps/rocksdb/rocksdb/env/file_system.cc +6 -1
  123. package/deps/rocksdb/rocksdb/env/fs_posix.cc +53 -0
  124. package/deps/rocksdb/rocksdb/env/io_posix.cc +63 -17
  125. package/deps/rocksdb/rocksdb/env/io_posix.h +30 -1
  126. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +132 -48
  127. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +92 -24
  128. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +727 -109
  129. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +3 -4
  130. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +1 -1
  131. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +8 -0
  132. package/deps/rocksdb/rocksdb/include/rocksdb/attribute_groups.h +20 -1
  133. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +9 -0
  134. package/deps/rocksdb/rocksdb/include/rocksdb/configurable.h +9 -5
  135. package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +2 -0
  136. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +10 -2
  137. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +1 -0
  138. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +7 -0
  139. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +34 -37
  140. package/deps/rocksdb/rocksdb/include/rocksdb/iterator_base.h +21 -0
  141. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +56 -28
  142. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +3 -0
  143. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +36 -28
  144. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +11 -0
  145. package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +1 -0
  146. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h +84 -60
  147. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/secondary_index.h +102 -0
  148. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +89 -2
  149. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +32 -0
  150. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +30 -1
  151. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +23 -2
  152. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  153. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +2 -0
  154. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +79 -21
  155. package/deps/rocksdb/rocksdb/memtable/skiplist.h +41 -18
  156. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +1 -5
  157. package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.cc +169 -0
  158. package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.h +400 -0
  159. package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +2 -0
  160. package/deps/rocksdb/rocksdb/options/cf_options.cc +137 -82
  161. package/deps/rocksdb/rocksdb/options/cf_options.h +18 -6
  162. package/deps/rocksdb/rocksdb/options/configurable.cc +31 -17
  163. package/deps/rocksdb/rocksdb/options/configurable_helper.h +7 -6
  164. package/deps/rocksdb/rocksdb/options/options_helper.cc +10 -8
  165. package/deps/rocksdb/rocksdb/options/options_parser.cc +74 -54
  166. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +89 -0
  167. package/deps/rocksdb/rocksdb/options/options_test.cc +112 -26
  168. package/deps/rocksdb/rocksdb/port/port.h +5 -9
  169. package/deps/rocksdb/rocksdb/src.mk +8 -0
  170. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h +4 -0
  171. package/deps/rocksdb/rocksdb/table/block_based/block.h +1 -7
  172. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +2 -0
  173. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +62 -80
  174. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +13 -3
  175. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +16 -5
  176. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +38 -7
  177. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +12 -4
  178. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +4 -1
  179. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +4 -1
  180. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +204 -1
  181. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +3 -3
  182. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +2 -1
  183. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h +4 -0
  184. package/deps/rocksdb/rocksdb/table/format.cc +3 -3
  185. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +4 -1
  186. package/deps/rocksdb/rocksdb/table/mock_table.cc +0 -50
  187. package/deps/rocksdb/rocksdb/table/mock_table.h +53 -0
  188. package/deps/rocksdb/rocksdb/table/plain/plain_table_factory.h +4 -0
  189. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -1
  190. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +10 -5
  191. package/deps/rocksdb/rocksdb/table/table_builder.h +3 -1
  192. package/deps/rocksdb/rocksdb/table/table_properties.cc +181 -0
  193. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +5 -5
  194. package/deps/rocksdb/rocksdb/table/table_test.cc +71 -64
  195. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim.py +45 -45
  196. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py +35 -35
  197. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py +43 -43
  198. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +41 -4
  199. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +1 -0
  200. package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +1 -1
  201. package/deps/rocksdb/rocksdb/unreleased_history/add.sh +13 -0
  202. package/deps/rocksdb/rocksdb/util/aligned_buffer.h +24 -5
  203. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +7 -0
  204. package/deps/rocksdb/rocksdb/util/file_checksum_helper.cc +0 -52
  205. package/deps/rocksdb/rocksdb/util/file_checksum_helper.h +1 -10
  206. package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +92 -0
  207. package/deps/rocksdb/rocksdb/util/thread_operation.h +1 -0
  208. package/deps/rocksdb/rocksdb/util/udt_util.cc +50 -4
  209. package/deps/rocksdb/rocksdb/util/udt_util.h +24 -11
  210. package/deps/rocksdb/rocksdb/util/udt_util_test.cc +26 -13
  211. package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +1 -16
  212. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +2 -0
  213. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.cc +214 -0
  214. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.h +60 -0
  215. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index_test.cc +124 -0
  216. package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_mixin.h +441 -0
  217. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.cc +34 -3
  218. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.h +7 -2
  219. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +437 -0
  220. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +34 -11
  221. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +14 -7
  222. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +7 -1
  223. package/deps/rocksdb/rocksdb/utilities/transactions/snapshot_checker.cc +17 -0
  224. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +69 -0
  225. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +20 -0
  226. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +1290 -0
  227. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +324 -0
  228. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +18 -1
  229. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +8 -1
  230. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +57 -12
  231. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +32 -3
  232. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +33 -2
  233. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +721 -9
  234. package/deps/rocksdb/rocksdb.gyp +2 -0
  235. package/package.json +1 -1
  236. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  237. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
@@ -141,8 +141,9 @@ class InlineSkipList {
141
141
  // Returns true iff an entry that compares equal to key is in the list.
142
142
  bool Contains(const char* key) const;
143
143
 
144
- // Return estimated number of entries smaller than `key`.
145
- uint64_t EstimateCount(const char* key) const;
144
+ // Return estimated number of entries from `start_ikey` to `end_ikey`.
145
+ uint64_t ApproximateNumEntries(const Slice& start_ikey,
146
+ const Slice& end_ikey) const;
146
147
 
147
148
  // Validate correctness of the skip-list.
148
149
  void TEST_Validate() const;
@@ -673,31 +674,88 @@ InlineSkipList<Comparator>::FindRandomEntry() const {
673
674
  }
674
675
 
675
676
  template <class Comparator>
676
- uint64_t InlineSkipList<Comparator>::EstimateCount(const char* key) const {
677
+ uint64_t InlineSkipList<Comparator>::ApproximateNumEntries(
678
+ const Slice& start_ikey, const Slice& end_ikey) const {
679
+ // The number of entries at a given level for the given range, in terms of
680
+ // the actual number of entries in that range (level 0), follows a binomial
681
+ // distribution, which is very well approximated by the Poisson distribution.
682
+ // That has stddev sqrt(x) where x is the expected number of entries (mean)
683
+ // at this level, and the best predictor of x is the number of observed
684
+ // entries (at this level). To predict the number of entries on level 0 we use
685
+ // x * kBranching ^ level. From the standard deviation, the P99+ relative
686
+ // error is roughly 3 * sqrt(x) / x. Thus, a reasonable approach would be to
687
+ // find the smallest level with at least some moderate constant number of entries
688
+ // in range. E.g. with at least ~40 entries, we expect P99+ relative error
689
+ // (approximation accuracy) of ~ 50% = 3 * sqrt(40) / 40; P95 error of
690
+ // ~30%; P75 error of < 20%.
691
+ //
692
+ // However, there are two issues with this approach, and an observation:
693
+ // * Pointer chasing on the larger (bottom) levels is much slower because of
694
+ // cache hierarchy effects, so when the result is smaller, getting the result
695
+ // will be substantially slower, despite traversing a similar number of
696
+ // entries. (We could be clever about pipelining our pointer chasing but
697
+ // that's complicated.)
698
+ // * The larger (bottom) levels also have lower variance because there's a
699
+ // chance (or certainty) that we reach level 0 and return the exact answer.
700
+ // * For applications in query planning, we can also tolerate more variance on
701
+ // small results because the impact of misestimating is likely smaller.
702
+ //
703
+ // These factors point us to an approach in which we have a higher minimum
704
+ // threshold number of samples for higher levels and lower for lower levels
705
+ // (see sufficient_samples below). This seems to yield roughly consistent
706
+ // relative error (stddev around 20%, less for large results) and roughly
707
+ // consistent query time around the time of two memtable point queries.
708
+ //
709
+ // Engineering observation: it is tempting to think that taking into account
710
+ // what we already found in how many entries occur on higher levels, not just
711
+ // the first iterated level with a sufficient number of samples, would yield
712
+ // a more accurate estimate. But that doesn't work because of the particular
713
+ // correlations and independences of the data: each level higher is just an
714
+ // independently probabilistic filtering of the level below it. That
715
+ // filtering from level l to l+1 has no more information about levels
716
+ // 0 .. l-1 than we can get from level l. The structure of RandomHeight() is
717
+ // a clue to these correlations and independences.
718
+
719
+ Node* lb = head_;
720
+ Node* ub = nullptr;
677
721
  uint64_t count = 0;
678
-
679
- Node* x = head_;
680
- int level = GetMaxHeight() - 1;
681
- const DecodedKey key_decoded = compare_.decode_key(key);
682
- while (true) {
683
- assert(x == head_ || compare_(x->Key(), key_decoded) < 0);
684
- Node* next = x->Next(level);
685
- if (next != nullptr) {
686
- PREFETCH(next->Next(level), 0, 1);
722
+ for (int level = GetMaxHeight() - 1; level >= 0; level--) {
723
+ auto sufficient_samples = static_cast<uint64_t>(level) * kBranching_ + 10U;
724
+ if (count >= sufficient_samples) {
725
+ // No more counting; apply powers of kBranching and avoid floating point
726
+ count *= kBranching_;
727
+ continue;
687
728
  }
688
- if (next == nullptr || compare_(next->Key(), key_decoded) >= 0) {
689
- if (level == 0) {
690
- return count;
691
- } else {
692
- // Switch to next list
693
- count *= kBranching_;
694
- level--;
729
+ count = 0;
730
+ Node* next;
731
+ // Get a more precise lower bound (for start key)
732
+ for (;;) {
733
+ next = lb->Next(level);
734
+ if (next == ub) {
735
+ break;
736
+ }
737
+ assert(next != nullptr);
738
+ if (compare_(next->Key(), start_ikey) >= 0) {
739
+ break;
740
+ }
741
+ lb = next;
742
+ }
743
+ // Count entries on this level until upper bound (for end key)
744
+ for (;;) {
745
+ if (next == ub) {
746
+ break;
747
+ }
748
+ assert(next != nullptr);
749
+ if (compare_(next->Key(), end_ikey) >= 0) {
750
+ // Save refined upper bound to potentially save key comparison
751
+ ub = next;
752
+ break;
695
753
  }
696
- } else {
697
- x = next;
698
754
  count++;
755
+ next = next->Next(level);
699
756
  }
700
757
  }
758
+ return count;
701
759
  }
702
760
 
703
761
  template <class Comparator>
@@ -64,8 +64,9 @@ class SkipList {
64
64
  // Returns true iff an entry that compares equal to key is in the list.
65
65
  bool Contains(const Key& key) const;
66
66
 
67
- // Return estimated number of entries smaller than `key`.
68
- uint64_t EstimateCount(const Key& key) const;
67
+ // Return estimated number of entries from `start_ikey` to `end_ikey`.
68
+ uint64_t ApproximateNumEntries(const Slice& start_ikey,
69
+ const Slice& end_ikey) const;
69
70
 
70
71
  // Iteration over the contents of a skip list
71
72
  class Iterator {
@@ -383,27 +384,49 @@ typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::FindLast()
383
384
  }
384
385
 
385
386
  template <typename Key, class Comparator>
386
- uint64_t SkipList<Key, Comparator>::EstimateCount(const Key& key) const {
387
+ uint64_t SkipList<Key, Comparator>::ApproximateNumEntries(
388
+ const Slice& start_ikey, const Slice& end_ikey) const {
389
+ // See InlineSkipList<Comparator>::ApproximateNumEntries() (copy-paste)
390
+ Node* lb = head_;
391
+ Node* ub = nullptr;
387
392
  uint64_t count = 0;
388
-
389
- Node* x = head_;
390
- int level = GetMaxHeight() - 1;
391
- while (true) {
392
- assert(x == head_ || compare_(x->key, key) < 0);
393
- Node* next = x->Next(level);
394
- if (next == nullptr || compare_(next->key, key) >= 0) {
395
- if (level == 0) {
396
- return count;
397
- } else {
398
- // Switch to next list
399
- count *= kBranching_;
400
- level--;
393
+ for (int level = GetMaxHeight() - 1; level >= 0; level--) {
394
+ auto sufficient_samples = static_cast<uint64_t>(level) * kBranching_ + 10U;
395
+ if (count >= sufficient_samples) {
396
+ // No more counting; apply powers of kBranching and avoid floating point
397
+ count *= kBranching_;
398
+ continue;
399
+ }
400
+ count = 0;
401
+ Node* next;
402
+ // Get a more precise lower bound (for start key)
403
+ for (;;) {
404
+ next = lb->Next(level);
405
+ if (next == ub) {
406
+ break;
407
+ }
408
+ assert(next != nullptr);
409
+ if (compare_(next->Key(), start_ikey) >= 0) {
410
+ break;
411
+ }
412
+ lb = next;
413
+ }
414
+ // Count entries on this level until upper bound (for end key)
415
+ for (;;) {
416
+ if (next == ub) {
417
+ break;
418
+ }
419
+ assert(next != nullptr);
420
+ if (compare_(next->Key(), end_ikey) >= 0) {
421
+ // Save refined upper bound to potentially save key comparison
422
+ ub = next;
423
+ break;
401
424
  }
402
- } else {
403
- x = next;
404
425
  count++;
426
+ next = next->Next(level);
405
427
  }
406
428
  }
429
+ return count;
407
430
  }
408
431
 
409
432
  template <typename Key, class Comparator>
@@ -108,11 +108,7 @@ class SkipListRep : public MemTableRep {
108
108
 
109
109
  uint64_t ApproximateNumEntries(const Slice& start_ikey,
110
110
  const Slice& end_ikey) override {
111
- std::string tmp;
112
- uint64_t start_count =
113
- skip_list_.EstimateCount(EncodeKey(&tmp, start_ikey));
114
- uint64_t end_count = skip_list_.EstimateCount(EncodeKey(&tmp, end_ikey));
115
- return (end_count >= start_count) ? (end_count - start_count) : 0;
111
+ return skip_list_.ApproximateNumEntries(start_ikey, end_ikey);
116
112
  }
117
113
 
118
114
  void UniqueRandomSample(const uint64_t num_entries,
@@ -0,0 +1,169 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ // This source code is licensed under both the GPLv2 (found in the
3
+ // COPYING file in the root directory) and Apache 2.0 License
4
+ // (found in the LICENSE.Apache file in the root directory).
5
+
6
+ #include "memtable/wbwi_memtable.h"
7
+
8
+ #include "db/memtable.h"
9
+
10
+ namespace ROCKSDB_NAMESPACE {
11
+
12
+ const std::unordered_map<WriteType, ValueType>
13
+ WBWIMemTableIterator::WriteTypeToValueTypeMap = {
14
+ {kPutRecord, kTypeValue},
15
+ {kMergeRecord, kTypeMerge},
16
+ {kDeleteRecord, kTypeDeletion},
17
+ {kSingleDeleteRecord, kTypeSingleDeletion},
18
+ {kDeleteRangeRecord, kTypeRangeDeletion},
19
+ {kPutEntityRecord, kTypeWideColumnEntity},
20
+ // Only the above record types are added to WBWI.
21
+ // kLogDataRecord, kXIDRecord, kUnknownRecord
22
+ };
23
+
24
+ InternalIterator* WBWIMemTable::NewIterator(
25
+ const ReadOptions&, UnownedPtr<const SeqnoToTimeMapping>, Arena* arena,
26
+ const SliceTransform* /* prefix_extractor */, bool for_flush) {
27
+ // Ingested WBWIMemTable should have an assigned seqno
28
+ assert(assigned_seqno_.upper_bound != kMaxSequenceNumber);
29
+ assert(assigned_seqno_.lower_bound != kMaxSequenceNumber);
30
+ assert(arena);
31
+ auto mem = arena->AllocateAligned(sizeof(WBWIMemTableIterator));
32
+ return new (mem) WBWIMemTableIterator(
33
+ std::unique_ptr<WBWIIterator>(wbwi_->NewIterator(cf_id_)),
34
+ assigned_seqno_, comparator_, for_flush);
35
+ }
36
+
37
+ inline InternalIterator* WBWIMemTable::NewIterator() const {
38
+ assert(assigned_seqno_.upper_bound != kMaxSequenceNumber);
39
+ assert(assigned_seqno_.lower_bound != kMaxSequenceNumber);
40
+ return new WBWIMemTableIterator(
41
+ std::unique_ptr<WBWIIterator>(wbwi_->NewIterator(cf_id_)),
42
+ assigned_seqno_, comparator_, /*for_flush=*/false);
43
+ }
44
+
45
+ bool WBWIMemTable::Get(const LookupKey& key, std::string* value,
46
+ PinnableWideColumns* columns, std::string* timestamp,
47
+ Status* s, MergeContext* merge_context,
48
+ SequenceNumber* max_covering_tombstone_seq,
49
+ SequenceNumber* out_seq, const ReadOptions&,
50
+ bool immutable_memtable, ReadCallback* callback,
51
+ bool* is_blob_index, bool do_merge) {
52
+ (void)immutable_memtable;
53
+ (void)timestamp;
54
+ (void)columns;
55
+ assert(immutable_memtable);
56
+ assert(!timestamp); // TODO: support UDT
57
+ assert(!columns); // TODO: support WideColumn
58
+ assert(assigned_seqno_.upper_bound != kMaxSequenceNumber);
59
+ assert(assigned_seqno_.lower_bound != kMaxSequenceNumber);
60
+ // WBWI does not support DeleteRange yet.
61
+ assert(!wbwi_->GetWriteBatch()->HasDeleteRange());
62
+
63
+ [[maybe_unused]] SequenceNumber read_seq =
64
+ GetInternalKeySeqno(key.internal_key());
65
+ std::unique_ptr<InternalIterator> iter{NewIterator()};
66
+ iter->Seek(key.internal_key());
67
+ const Slice lookup_user_key = key.user_key();
68
+
69
+ while (iter->Valid() && comparator_->EqualWithoutTimestamp(
70
+ ExtractUserKey(iter->key()), lookup_user_key)) {
71
+ uint64_t tag = ExtractInternalKeyFooter(iter->key());
72
+ ValueType type;
73
+ SequenceNumber seq;
74
+ UnPackSequenceAndType(tag, &seq, &type);
75
+ // Unsupported operations.
76
+ assert(type != kTypeBlobIndex);
77
+ assert(type != kTypeWideColumnEntity);
78
+ assert(type != kTypeValuePreferredSeqno);
79
+ assert(type != kTypeDeletionWithTimestamp);
80
+ assert(type != kTypeMerge);
81
+ if (!callback || callback->IsVisible(seq)) {
82
+ if (*out_seq == kMaxSequenceNumber) {
83
+ *out_seq = std::max(seq, *max_covering_tombstone_seq);
84
+ }
85
+ if (*max_covering_tombstone_seq > seq) {
86
+ type = kTypeRangeDeletion;
87
+ }
88
+ switch (type) {
89
+ case kTypeValue: {
90
+ HandleTypeValue(lookup_user_key, iter->value(), iter->IsValuePinned(),
91
+ do_merge, s->IsMergeInProgress(), merge_context,
92
+ moptions_.merge_operator, clock_,
93
+ moptions_.statistics, moptions_.info_log, s, value,
94
+ columns, is_blob_index);
95
+ assert(seq <= read_seq);
96
+ return /*found_final_value=*/true;
97
+ }
98
+ case kTypeDeletion:
99
+ case kTypeSingleDeletion:
100
+ case kTypeRangeDeletion: {
101
+ HandleTypeDeletion(lookup_user_key, s->IsMergeInProgress(),
102
+ merge_context, moptions_.merge_operator, clock_,
103
+ moptions_.statistics, moptions_.info_log, s, value,
104
+ columns);
105
+ assert(seq <= read_seq);
106
+ return /*found_final_value=*/true;
107
+ }
108
+ default: {
109
+ std::string msg("Unrecognized or unsupported value type: " +
110
+ std::to_string(static_cast<int>(type)) + ". ");
111
+ msg.append("User key: " +
112
+ ExtractUserKey(iter->key()).ToString(/*hex=*/true) + ". ");
113
+ msg.append("seq: " + std::to_string(seq) + ".");
114
+ *s = Status::Corruption(msg.c_str());
115
+ return /*found_final_value=*/true;
116
+ }
117
+ }
118
+ }
119
+ // Current key not visible or we read a merge key
120
+ assert(s->IsMergeInProgress() || (callback && !callback->IsVisible(seq)));
121
+ iter->Next();
122
+ }
123
+ if (!iter->status().ok() &&
124
+ (s->ok() || s->IsMergeInProgress() || s->IsNotFound())) {
125
+ *s = iter->status();
126
+ // stop further look up
127
+ return true;
128
+ }
129
+ return /*found_final_value=*/false;
130
+ }
131
+
132
+ void WBWIMemTable::MultiGet(const ReadOptions& read_options,
133
+ MultiGetRange* range, ReadCallback* callback,
134
+ bool immutable_memtable) {
135
+ (void)immutable_memtable;
136
+ // Should only be used as immutable memtable.
137
+ assert(immutable_memtable);
138
+ // TODO: reuse the InternalIterator created in Get().
139
+ for (auto iter = range->begin(); iter != range->end(); ++iter) {
140
+ SequenceNumber dummy_seq = 0;
141
+ bool found_final_value =
142
+ Get(*iter->lkey, iter->value ? iter->value->GetSelf() : nullptr,
143
+ iter->columns, iter->timestamp, iter->s, &(iter->merge_context),
144
+ &(iter->max_covering_tombstone_seq), &dummy_seq, read_options, true,
145
+ callback, nullptr, true);
146
+ if (found_final_value) {
147
+ if (iter->s->ok() || iter->s->IsNotFound()) {
148
+ if (iter->value) {
149
+ iter->value->PinSelf();
150
+ range->AddValueSize(iter->value->size());
151
+ } else {
152
+ assert(iter->columns);
153
+ range->AddValueSize(iter->columns->serialized_size());
154
+ }
155
+ }
156
+ range->MarkKeyDone(iter);
157
+ if (range->GetValueSize() > read_options.value_size_soft_limit) {
158
+ // Set all remaining keys in range to Abort
159
+ for (auto range_iter = range->begin(); range_iter != range->end();
160
+ ++range_iter) {
161
+ range->MarkKeyDone(range_iter);
162
+ *(range_iter->s) = Status::Aborted();
163
+ }
164
+ break;
165
+ }
166
+ }
167
+ }
168
+ }
169
+ } // namespace ROCKSDB_NAMESPACE