datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,48 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ add_library(sampling INTERFACE)
19
+
20
+ add_library(${PROJECT_NAME}::SAMPLING ALIAS sampling)
21
+
22
+ if (BUILD_TESTS)
23
+ add_subdirectory(test)
24
+ endif()
25
+
26
+ target_include_directories(sampling
27
+ INTERFACE
28
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
29
+ $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
30
+ )
31
+
32
+ target_link_libraries(sampling INTERFACE common)
33
+ target_compile_features(sampling INTERFACE cxx_std_11)
34
+
35
+ set(sampling_HEADERS "include/var_opt_sketch.hpp;include/var_opt_sketch_impl.hpp;include/var_opt_union.hpp;include/var_opt_union_impl.hpp") # public headers to install: sketch and union
36
+
37
+ install(TARGETS sampling
38
+ EXPORT ${PROJECT_NAME}
39
+ )
40
+
41
+ install(FILES ${sampling_HEADERS}
42
+ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
43
+
44
+ target_sources(sampling
45
+ INTERFACE
46
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/var_opt_sketch.hpp
47
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/var_opt_sketch_impl.hpp
48
+ )
@@ -0,0 +1,392 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _VAR_OPT_SKETCH_HPP_
21
+ #define _VAR_OPT_SKETCH_HPP_
22
+
23
+ #include "serde.hpp"
24
+ #include "common_defs.hpp"
25
+
26
+ #include <iterator>
27
+ #include <vector>
28
+
29
+
30
+ /**
31
+ * This sketch samples data from a stream of items, designed for optimal (minimum) variance when
32
+ * querying the sketch to estimate subset sums of items matching a provided predicate. Variance
33
+ * optimal (varopt) sampling is related to reservoir sampling, with improved error bounds for
34
+ * subset sum estimation.
35
+ *
36
+ * author Kevin Lang
37
+ * author Jon Malkin
38
+ */
39
+ namespace datasketches {
40
+
41
+ template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>; // allocator A rebound to allocate uint8_t
42
+ template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>; // byte vector that allocates through AllocU8<A>
43
+
44
+ /**
45
+ * A struct to hold the result of subset sum queries: the estimate, its lower and upper bounds, and the total sketch weight
46
+ */
47
+ struct subset_summary {
48
+ double lower_bound; // lower bound on the subset sum
49
+ double estimate; // estimated subset sum
50
+ double upper_bound; // upper bound on the subset sum
51
+ double total_sketch_weight; // total sketch weight reported alongside the estimate
52
+ };
53
+
54
+ enum resize_factor { X1 = 0, X2, X4, X8 }; // NOTE(review): presumably the growth multiplier applied when the data arrays are resized -- confirm against grow_data_arrays()
55
+
56
+ template <typename T, typename S, typename A> class var_opt_union; // forward declaration
57
+
58
+ template <typename T, typename S = serde<T>, typename A = std::allocator<T>>
59
+ class var_opt_sketch {
60
+
61
+ public:
62
+ static const resize_factor DEFAULT_RESIZE_FACTOR = X8;
63
+ static const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
64
+
65
+ explicit var_opt_sketch(uint32_t k, resize_factor rf = DEFAULT_RESIZE_FACTOR);
66
+ var_opt_sketch(const var_opt_sketch& other);
67
+ var_opt_sketch(var_opt_sketch&& other) noexcept;
68
+
69
+ ~var_opt_sketch();
70
+
71
+ var_opt_sketch& operator=(const var_opt_sketch& other);
72
+ var_opt_sketch& operator=(var_opt_sketch&& other);
73
+
74
+ /**
75
+ * Updates this sketch with the given data item with the given weight.
76
+ * This method takes an lvalue.
77
+ * @param item an item from a stream of items
78
+ * @param weight the weight of the item
79
+ */
80
+ void update(const T& item, double weight=1.0);
81
+
82
+ /**
83
+ * Updates this sketch with the given data item with the given weight.
84
+ * This method takes an rvalue.
85
+ * @param item an item from a stream of items
86
+ * @param weight the weight of the item
87
+ */
88
+ void update(T&& item, double weight=1.0);
89
+
90
+ /**
91
+ * Returns the configured maximum sample size.
92
+ * @return configured maximum sample size
93
+ */
94
+ inline uint32_t get_k() const;
95
+
96
+ /**
97
+ * Returns the length of the input stream.
98
+ * @return stream length
99
+ */
100
+ inline uint64_t get_n() const;
101
+
102
+ /**
103
+ * Returns the number of samples currently in the sketch
104
+ * @return number of samples currently in the sketch
105
+ */
106
+ inline uint32_t get_num_samples() const;
107
+
108
+ /**
109
+ * Computes an estimated subset sum from the entire stream for objects matching a given
110
+ * predicate. Provides a lower bound, estimate, and upper bound using a target of 2 standard
111
+ * deviations. This is technically a heuristic method and tries to err on the conservative side.
112
+ * @param predicate a predicate function
113
+ * @return a subset_summary item with estimate, upper and lower bounds,
114
+ * and total sketch weight
115
+ */
116
+ template<typename P>
117
+ subset_summary estimate_subset_sum(P predicate) const;
118
+
119
+ /**
120
+ * Returns true if the sketch is empty.
121
+ * @return empty flag
122
+ */
123
+ inline bool is_empty() const;
124
+
125
+ /**
126
+ * Resets the sketch to its default, empty state.
127
+ */
128
+ void reset();
129
+
130
+ /**
131
+ * Computes size needed to serialize the current state of the sketch.
132
+ * This version is for fixed-size arithmetic types (integral and floating point).
133
+ * @return size in bytes needed to serialize this sketch
134
+ */
135
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
136
+ inline size_t get_serialized_size_bytes() const;
137
+
138
+ /**
139
+ * Computes size needed to serialize the current state of the sketch.
140
+ * This version is for all other types and can be expensive since every item needs to be looked at.
141
+ * @return size in bytes needed to serialize this sketch
142
+ */
143
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
144
+ inline size_t get_serialized_size_bytes() const;
145
+
146
+ // This is a convenience alias for users
147
+ // The type returned by the following serialize method
148
+ typedef vector_u8<A> vector_bytes;
149
+
150
+ /**
151
+ * This method serializes the sketch as a vector of bytes.
152
+ * An optional header can be reserved in front of the sketch.
153
+ * It is a blank space of a given size.
154
+ * This header is used in Datasketches PostgreSQL extension.
155
+ * @param header_size_bytes space to reserve in front of the sketch
156
+ */
157
+ vector_bytes serialize(unsigned header_size_bytes = 0) const;
158
+
159
+ /**
160
+ * This method serializes the sketch into a given stream in a binary form
161
+ * @param os output stream
162
+ */
163
+ void serialize(std::ostream& os) const;
164
+
165
+ /**
166
+ * This method deserializes a sketch from a given stream.
167
+ * @param is input stream
168
+ * @return an instance of a sketch
169
+ */
170
+ static var_opt_sketch deserialize(std::istream& is);
171
+
172
+ /**
173
+ * This method deserializes a sketch from a given array of bytes.
174
+ * @param bytes pointer to the array of bytes
175
+ * @param size the size of the array
176
+ * @return an instance of a sketch
177
+ */
178
+ static var_opt_sketch deserialize(const void* bytes, size_t size);
179
+
180
+ /**
181
+ * Prints a summary of the sketch.
182
+ * @return the summary as a string
183
+ */
184
+ string<A> to_string() const;
185
+
186
+ /**
187
+ * Prints the raw sketch items to a string. Calls items_to_stream() internally.
188
+ * Only works for type T with a defined operator<<() and
189
+ * kept separate from to_string() to allow compilation even if
190
+ * T does not have such an operator defined.
191
+ * @return a string with the sketch items
192
+ */
193
+ string<A> items_to_string() const;
194
+
195
+ class const_iterator;
196
+ const_iterator begin() const;
197
+ const_iterator end() const;
198
+
199
+ private:
200
+ typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
201
+ typedef typename std::allocator_traits<A>::template rebind_alloc<bool> AllocBool;
202
+
203
+ static const uint32_t MIN_LG_ARR_ITEMS = 3;
204
+
205
+ static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
206
+ static const uint8_t PREAMBLE_LONGS_WARMUP = 3;
207
+ static const uint8_t PREAMBLE_LONGS_FULL = 4;
208
+ static const uint8_t SER_VER = 2;
209
+ static const uint8_t FAMILY_ID = 13;
210
+ static const uint8_t EMPTY_FLAG_MASK = 4;
211
+ static const uint8_t GADGET_FLAG_MASK = 128;
212
+
213
+ // Number of standard deviations to use for subset sum error bounds
214
+ constexpr static const double DEFAULT_KAPPA = 2.0;
215
+
216
+ // TODO: should probably rearrange a bit to minimize gaps once aligned
217
+ uint32_t k_; // max size of sketch, in items
218
+
219
+ uint32_t h_; // number of items in heap
220
+ uint32_t m_; // number of items in middle region
221
+ uint32_t r_; // number of items in reservoir-like region
222
+
223
+ uint64_t n_; // total number of items processed by sketch
224
+ double total_wt_r_; // total weight of items in reservoir-like area
225
+
226
+ resize_factor rf_; // resize factor
227
+
228
+ uint32_t curr_items_alloc_; // currently allocated array size
229
+ bool filled_data_; // true if we've explicitly set all entries in data_
230
+
231
+ T* data_; // stored sampled items
232
+ double* weights_; // weights for sampled items
233
+
234
+ // The next two fields are hidden from the user because they are part of the state of the
235
+ // unioning algorithm, NOT part of a varopt sketch, or even of a varopt "gadget" (our name for
236
+ // the potentially invalid sketch that is maintained by the unioning algorithm). It would make
237
+ // more sense logically for these fields to be declared in the unioning object (whose entire
238
+ // purpose is storing the state of the unioning algorithm) but for reasons of programming
239
+ // convenience we are currently declaring them here. However, that could change in the future.
240
+
241
+ // Following int is:
242
+ // 1. Zero (for a varopt sketch)
243
+ // 2. Count of marked items in H region, if part of a unioning algo's gadget
244
+ uint32_t num_marks_in_h_;
245
+
246
+ // The following array is absent in a varopt sketch, and notionally present in a gadget
247
+ // (although it really belongs in the unioning object). If the array were to be made explicit,
248
+ // some additional coding would need to be done to ensure that all of the necessary data motion
249
+ // occurs and is properly tracked.
250
+ bool* marks_;
251
+
252
+ // used during deserialization to avoid memory leaks upon errors
253
+ class items_deleter;
254
+ class weights_deleter;
255
+ class marks_deleter;
256
+
257
+ var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget);
258
+ var_opt_sketch(uint32_t k, uint32_t h, uint32_t m, uint32_t r, uint64_t n, double total_wt_r, resize_factor rf,
259
+ uint32_t curr_items_alloc, bool filled_data, std::unique_ptr<T, items_deleter> items,
260
+ std::unique_ptr<double, weights_deleter> weights, uint32_t num_marks_in_h,
261
+ std::unique_ptr<bool, marks_deleter> marks);
262
+
263
+ friend class var_opt_union<T,S,A>;
264
+ var_opt_sketch(const var_opt_sketch& other, bool as_sketch, uint64_t adjusted_n);
265
+ var_opt_sketch(T* data, double* weights, size_t len, uint32_t k, uint64_t n, uint32_t h_count, uint32_t r_count, double total_wt_r);
266
+
267
+ string<A> items_to_string(bool print_gap) const;
268
+
269
+ // internal-use-only update
270
+ template<typename O>
271
+ inline void update(O&& item, double weight, bool mark);
272
+
273
+ template<typename O>
274
+ inline void update_warmup_phase(O&& item, double weight, bool mark);
275
+
276
+ template<typename O>
277
+ inline void update_light(O&& item, double weight, bool mark);
278
+
279
+ template<typename O>
280
+ inline void update_heavy_r_eq1(O&& item, double weight, bool mark);
281
+
282
+ template<typename O>
283
+ inline void update_heavy_general(O&& item, double weight, bool mark);
284
+
285
+ inline double get_tau() const;
286
+ inline double peek_min() const;
287
+ inline bool is_marked(uint32_t idx) const;
288
+
289
+ inline uint32_t pick_random_slot_in_r() const;
290
+ inline uint32_t choose_delete_slot(double wt_cand, uint32_t num_cand) const;
291
+ inline uint32_t choose_weighted_delete_slot(double wt_cand, uint32_t num_cand) const;
292
+
293
+ template<typename O>
294
+ inline void push(O&& item, double wt, bool mark);
295
+ inline void transition_from_warmup();
296
+ inline void convert_to_heap();
297
+ inline void restore_towards_leaves(uint32_t slot_in);
298
+ inline void restore_towards_root(uint32_t slot_in);
299
+ inline void pop_min_to_m_region();
300
+ void grow_candidate_set(double wt_cands, uint32_t num_cands);
301
+ void decrease_k_by_1();
302
+ void strip_marks();
303
+ void force_set_k(uint32_t k); // used to resolve union gadget into sketch
304
+ void downsample_candidate_set(double wt_cands, uint32_t num_cands);
305
+ inline void swap_values(uint32_t src, uint32_t dst);
306
+ void grow_data_arrays();
307
+ void allocate_data_arrays(uint32_t tgt_size, bool use_marks);
308
+
309
+ // validation
310
+ static void check_preamble_longs(uint8_t preamble_longs, uint8_t flags);
311
+ static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
312
+ static uint32_t validate_and_get_target_size(uint32_t preamble_longs, uint32_t k, uint64_t n,
313
+ uint32_t h, uint32_t r, resize_factor rf);
314
+
315
+ // things to move to common and be shared among sketches
316
+ static uint32_t get_adjusted_size(uint32_t max_size, uint32_t resize_target);
317
+ static uint32_t starting_sub_multiple(uint32_t lg_target, uint32_t lg_rf, uint32_t lg_min);
318
+ static inline double pseudo_hypergeometric_ub_on_p(uint64_t n, uint32_t k, double sampling_rate);
319
+ static inline double pseudo_hypergeometric_lb_on_p(uint64_t n, uint32_t k, double sampling_rate);
320
+ static bool is_power_of_2(uint32_t v);
321
+ static uint32_t to_log_2(uint32_t v);
322
+ static inline uint32_t next_int(uint32_t max_value);
323
+ static inline double next_double_exclude_zero();
324
+
325
+ class iterator;
326
+ };
327
+
328
+ template<typename T, typename S, typename A>
329
+ class var_opt_sketch<T, S, A>::const_iterator : public std::iterator<std::input_iterator_tag, T> {
330
+ public:
331
+ const_iterator(const const_iterator& other);
332
+ const_iterator& operator++();
333
+ const_iterator& operator++(int);
334
+ bool operator==(const const_iterator& other) const;
335
+ bool operator!=(const const_iterator& other) const;
336
+ const std::pair<const T&, const double> operator*() const;
337
+
338
+ private:
339
+ friend class var_opt_sketch<T,S,A>;
340
+ friend class var_opt_union<T,S,A>;
341
+
342
+ // default iterator over full sketch
343
+ const_iterator(const var_opt_sketch<T,S,A>& sk, bool is_end);
344
+
345
+ // iterates over only one of the H or R region, optionally applying weight correction
346
+ // to R region (can correct for numerical precision issues)
347
+ const_iterator(const var_opt_sketch<T,S,A>& sk, bool is_end, bool use_r_region);
348
+
349
+ bool get_mark() const;
350
+
351
+ const var_opt_sketch<T,S,A>* sk_;
352
+ double cum_r_weight_; // used for weight correction
353
+ double r_item_wt_;
354
+ size_t idx_;
355
+ const size_t final_idx_;
356
+ bool weight_correction_;
357
+ };
358
+
359
+ // non-const iterator for internal use
360
+ template<typename T, typename S, typename A>
361
+ class var_opt_sketch<T, S, A>::iterator : public std::iterator<std::input_iterator_tag, T> {
362
+ public:
363
+ iterator(const iterator& other);
364
+ iterator& operator++();
365
+ iterator& operator++(int);
366
+ bool operator==(const iterator& other) const;
367
+ bool operator!=(const iterator& other) const;
368
+ std::pair<T&, double> operator*();
369
+
370
+ private:
371
+ friend class var_opt_sketch<T,S,A>;
372
+ friend class var_opt_union<T,S,A>;
373
+
374
+ // iterates over only one of the H or R region, applying weight correction
375
+ // if iterating over R region (can correct for numerical precision issues)
376
+ iterator(const var_opt_sketch<T,S,A>& sk, bool is_end, bool use_r_region);
377
+
378
+ bool get_mark() const;
379
+
380
+ const var_opt_sketch<T,S,A>* sk_;
381
+ double cum_r_weight_; // used for weight correction
382
+ double r_item_wt_;
383
+ size_t idx_;
384
+ const size_t final_idx_;
385
+ };
386
+
387
+
388
+ } // namespace datasketches
389
+
390
+ #include "var_opt_sketch_impl.hpp"
391
+
392
+ #endif // _VAR_OPT_SKETCH_HPP_
@@ -0,0 +1,1752 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _VAR_OPT_SKETCH_IMPL_HPP_
21
+ #define _VAR_OPT_SKETCH_IMPL_HPP_
22
+
23
+ #include <memory>
24
+ #include <sstream>
25
+ #include <cmath>
26
+ #include <random>
27
+ #include <algorithm>
28
+
29
+ #include "var_opt_sketch.hpp"
30
+ #include "serde.hpp"
31
+ #include "bounds_binomial_proportions.hpp"
32
+ #include "count_zeros.hpp"
33
+ #include "memory_operations.hpp"
34
+ #include "ceiling_power_of_2.hpp"
35
+
36
+ namespace datasketches {
37
+
38
+ /**
39
+ * Implementation code for the VarOpt sketch.
40
+ *
41
+ * author Kevin Lang
42
+ * author Jon Malkin
43
+ */
44
+ template<typename T, typename S, typename A>
45
+ var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf) :
46
+ var_opt_sketch<T,S,A>(k, rf, false) {}
47
+
48
+ template<typename T, typename S, typename A>
49
+ var_opt_sketch<T,S,A>::var_opt_sketch(const var_opt_sketch& other) :
50
+ k_(other.k_),
51
+ h_(other.h_),
52
+ m_(other.m_),
53
+ r_(other.r_),
54
+ n_(other.n_),
55
+ total_wt_r_(other.total_wt_r_),
56
+ rf_(other.rf_),
57
+ curr_items_alloc_(other.curr_items_alloc_),
58
+ filled_data_(other.filled_data_),
59
+ data_(nullptr),
60
+ weights_(nullptr),
61
+ num_marks_in_h_(other.num_marks_in_h_),
62
+ marks_(nullptr)
63
+ {
64
+ data_ = A().allocate(curr_items_alloc_);
65
+ // skip gap or anything unused at the end
66
+ for (size_t i = 0; i < h_; ++i)
67
+ new (&data_[i]) T(other.data_[i]);
68
+ for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i)
69
+ new (&data_[i]) T(other.data_[i]);
70
+
71
+ // we skipped the gap
72
+ filled_data_ = false;
73
+
74
+ weights_ = AllocDouble().allocate(curr_items_alloc_);
75
+ // doubles so can successfully copy regardless of the internal state
76
+ std::copy(&other.weights_[0], &other.weights_[curr_items_alloc_], weights_);
77
+
78
+ if (other.marks_ != nullptr) {
79
+ marks_ = AllocBool().allocate(curr_items_alloc_);
80
+ std::copy(&other.marks_[0], &other.marks_[curr_items_alloc_], marks_);
81
+ }
82
+ }
83
+
84
+ template<typename T, typename S, typename A>
85
+ var_opt_sketch<T,S,A>::var_opt_sketch(const var_opt_sketch& other, bool as_sketch, uint64_t adjusted_n) :
86
+ k_(other.k_),
87
+ h_(other.h_),
88
+ m_(other.m_),
89
+ r_(other.r_),
90
+ n_(adjusted_n),
91
+ total_wt_r_(other.total_wt_r_),
92
+ rf_(other.rf_),
93
+ curr_items_alloc_(other.curr_items_alloc_),
94
+ filled_data_(other.filled_data_),
95
+ data_(nullptr),
96
+ weights_(nullptr),
97
+ num_marks_in_h_(other.num_marks_in_h_),
98
+ marks_(nullptr)
99
+ {
100
+ data_ = A().allocate(curr_items_alloc_);
101
+ // skip gap or anything unused at the end
102
+ for (size_t i = 0; i < h_; ++i)
103
+ new (&data_[i]) T(other.data_[i]);
104
+ for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i)
105
+ new (&data_[i]) T(other.data_[i]);
106
+
107
+ // we skipped the gap
108
+ filled_data_ = false;
109
+
110
+ weights_ = AllocDouble().allocate(curr_items_alloc_);
111
+ // doubles so can successfully copy regardless of the internal state
112
+ std::copy(&other.weights_[0], &other.weights_[curr_items_alloc_], weights_);
113
+
114
+ if (!as_sketch && other.marks_ != nullptr) {
115
+ marks_ = AllocBool().allocate(curr_items_alloc_);
116
+ std::copy(&other.marks_[0], &other.marks_[curr_items_alloc_], marks_);
117
+ }
118
+ }
119
+
120
+ template<typename T, typename S, typename A>
121
+ var_opt_sketch<T,S,A>::var_opt_sketch(T* data, double* weights, size_t len,
122
+ uint32_t k, uint64_t n, uint32_t h_count, uint32_t r_count, double total_wt_r) :
123
+ k_(k),
124
+ h_(h_count),
125
+ m_(0),
126
+ r_(r_count),
127
+ n_(n),
128
+ total_wt_r_(total_wt_r),
129
+ rf_(DEFAULT_RESIZE_FACTOR),
130
+ curr_items_alloc_(len),
131
+ filled_data_(n > k),
132
+ data_(data),
133
+ weights_(weights),
134
+ num_marks_in_h_(0),
135
+ marks_(nullptr)
136
+ {}
137
+
138
+ template<typename T, typename S, typename A>
139
+ var_opt_sketch<T,S,A>::var_opt_sketch(var_opt_sketch&& other) noexcept :
140
+ k_(other.k_),
141
+ h_(other.h_),
142
+ m_(other.m_),
143
+ r_(other.r_),
144
+ n_(other.n_),
145
+ total_wt_r_(other.total_wt_r_),
146
+ rf_(other.rf_),
147
+ curr_items_alloc_(other.curr_items_alloc_),
148
+ filled_data_(other.filled_data_),
149
+ data_(other.data_),
150
+ weights_(other.weights_),
151
+ num_marks_in_h_(other.num_marks_in_h_),
152
+ marks_(other.marks_)
153
+ {
154
+ other.data_ = nullptr;
155
+ other.weights_ = nullptr;
156
+ other.marks_ = nullptr;
157
+ }
158
+
159
+ template<typename T, typename S, typename A>
160
+ var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget) :
161
+ k_(k), h_(0), m_(0), r_(0), n_(0), total_wt_r_(0.0), rf_(rf) {
162
+ if (k == 0 || k_ > MAX_K) {
163
+ throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
164
+ }
165
+
166
+ uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
167
+ uint32_t initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf_, MIN_LG_ARR_ITEMS);
168
+ curr_items_alloc_ = get_adjusted_size(k_, 1 << initial_lg_size);
169
+ if (curr_items_alloc_ == k_) { // if full size, need to leave 1 for the gap
170
+ ++curr_items_alloc_;
171
+ }
172
+
173
+ allocate_data_arrays(curr_items_alloc_, is_gadget);
174
+ num_marks_in_h_ = 0;
175
+ }
176
+
177
+ template<typename T, typename S, typename A>
178
+ var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, uint32_t h, uint32_t m, uint32_t r, uint64_t n, double total_wt_r, resize_factor rf,
179
+ uint32_t curr_items_alloc, bool filled_data, std::unique_ptr<T, items_deleter> items,
180
+ std::unique_ptr<double, weights_deleter> weights, uint32_t num_marks_in_h,
181
+ std::unique_ptr<bool, marks_deleter> marks) :
182
+ k_(k),
183
+ h_(h),
184
+ m_(m),
185
+ r_(r),
186
+ n_(n),
187
+ total_wt_r_(total_wt_r),
188
+ rf_(rf),
189
+ curr_items_alloc_(curr_items_alloc),
190
+ filled_data_(filled_data),
191
+ data_(items.release()),
192
+ weights_(weights.release()),
193
+ num_marks_in_h_(num_marks_in_h),
194
+ marks_(marks.release())
195
+ {}
196
+
197
+
198
+ template<typename T, typename S, typename A>
199
+ var_opt_sketch<T,S,A>::~var_opt_sketch() {
200
+ if (data_ != nullptr) {
201
+ if (filled_data_) {
202
+ // destroy everything
203
+ const size_t num_to_destroy = std::min(k_ + 1, curr_items_alloc_);
204
+ for (size_t i = 0; i < num_to_destroy; ++i) {
205
+ A().destroy(data_ + i);
206
+ }
207
+ } else {
208
+ // skip gap or anything unused at the end
209
+ for (size_t i = 0; i < h_; ++i) {
210
+ A().destroy(data_+ i);
211
+ }
212
+
213
+ for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i) {
214
+ A().destroy(data_ + i);
215
+ }
216
+ }
217
+ A().deallocate(data_, curr_items_alloc_);
218
+ }
219
+
220
+ if (weights_ != nullptr) {
221
+ AllocDouble().deallocate(weights_, curr_items_alloc_);
222
+ }
223
+
224
+ if (marks_ != nullptr) {
225
+ AllocBool().deallocate(marks_, curr_items_alloc_);
226
+ }
227
+ }
228
+
229
+ template<typename T, typename S, typename A>
230
+ var_opt_sketch<T,S,A>& var_opt_sketch<T,S,A>::operator=(const var_opt_sketch& other) {
231
+ var_opt_sketch<T,S,A> sk_copy(other);
232
+ std::swap(k_, sk_copy.k_);
233
+ std::swap(h_, sk_copy.h_);
234
+ std::swap(m_, sk_copy.m_);
235
+ std::swap(r_, sk_copy.r_);
236
+ std::swap(n_, sk_copy.n_);
237
+ std::swap(total_wt_r_, sk_copy.total_wt_r_);
238
+ std::swap(rf_, sk_copy.rf_);
239
+ std::swap(curr_items_alloc_, sk_copy.curr_items_alloc_);
240
+ std::swap(filled_data_, sk_copy.filled_data_);
241
+ std::swap(data_, sk_copy.data_);
242
+ std::swap(weights_, sk_copy.weights_);
243
+ std::swap(num_marks_in_h_, sk_copy.num_marks_in_h_);
244
+ std::swap(marks_, sk_copy.marks_);
245
+ return *this;
246
+ }
247
+
248
+ template<typename T, typename S, typename A>
249
+ var_opt_sketch<T,S,A>& var_opt_sketch<T,S,A>::operator=(var_opt_sketch&& other) {
250
+ std::swap(k_, other.k_);
251
+ std::swap(h_, other.h_);
252
+ std::swap(m_, other.m_);
253
+ std::swap(r_, other.r_);
254
+ std::swap(n_, other.n_);
255
+ std::swap(total_wt_r_, other.total_wt_r_);
256
+ std::swap(rf_, other.rf_);
257
+ std::swap(curr_items_alloc_, other.curr_items_alloc_);
258
+ std::swap(filled_data_, other.filled_data_);
259
+ std::swap(data_, other.data_);
260
+ std::swap(weights_, other.weights_);
261
+ std::swap(num_marks_in_h_, other.num_marks_in_h_);
262
+ std::swap(marks_, other.marks_);
263
+ return *this;
264
+ }
265
+
266
+ /*
267
+ * An empty sketch requires 8 bytes.
268
+ *
269
+ * <pre>
270
+ * Long || Start Byte Adr:
271
+ * Adr:
272
+ * || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
273
+ * 0 || Preamble_Longs | SerVer | FamID | Flags |---------Max Res. Size (K)---------|
274
+ * </pre>
275
+ *
276
+ * A non-empty sketch requires 24 bytes of preamble for an under-full sample; once there are
277
+ * at least k items the sketch uses 32 bytes of preamble.
278
+ *
279
+ * The count of items seen is limited to 48 bits (~256 trillion) even though there are adjacent
280
+ * unused preamble bits. The acceptance probability for an item is a double in the range [0,1),
281
+ * limiting us to 53 bits of randomness due to details of the IEEE floating point format. To
282
+ * ensure meaningful probabilities as the items seen count approaches capacity, we intentionally
283
+ * use slightly fewer bits.
284
+ *
285
+ * Following the header are weights for the heavy items, then marks in the event this is a gadget.
286
+ * The serialized items come last.
287
+ *
288
+ * <pre>
289
+ * Long || Start Byte Adr:
290
+ * Adr:
291
+ * || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
292
+ * 0 || Preamble_Longs | SerVer | FamID | Flags |---------Max Res. Size (K)---------|
293
+ *
294
+ * || 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
295
+ * 1 ||---------------------------Items Seen Count (N)--------------------------------|
296
+ *
297
+ * || 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
298
+ * 2 ||-------------Item Count in H---------------|-------Item Count in R-------------|
299
+ *
300
+ * || 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
301
+ * 3 ||-------------------------------Total Weight in R-------------------------------|
302
+ * </pre>
303
+ */
304
+
305
+ // implementation for fixed-size arithmetic types (integral and floating point)
306
+ template<typename T, typename S, typename A>
307
+ template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
308
+ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
309
+ if (is_empty()) { return PREAMBLE_LONGS_EMPTY << 3; }
310
+ size_t num_bytes = (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL) << 3;
311
+ num_bytes += h_ * sizeof(double); // weights
312
+ if (marks_ != nullptr) { // marks
313
+ num_bytes += (h_ / 8) + (h_ % 8 > 0);
314
+ }
315
+ num_bytes += (h_ + r_) * sizeof(TT); // the actual items
316
+ return num_bytes;
317
+ }
318
+
319
+ // implementation for all other types
320
+ template<typename T, typename S, typename A>
321
+ template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
322
+ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
323
+ if (is_empty()) { return PREAMBLE_LONGS_EMPTY << 3; }
324
+ size_t num_bytes = (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL) << 3;
325
+ num_bytes += h_ * sizeof(double); // weights
326
+ if (marks_ != nullptr) { // marks
327
+ num_bytes += (h_ / 8) + (h_ % 8 > 0);
328
+ }
329
+ // must iterate over the items
330
+ for (auto& it: *this)
331
+ num_bytes += S().size_of_item(it.first);
332
+ return num_bytes;
333
+ }
334
+
335
+ template<typename T, typename S, typename A>
336
+ std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned header_size_bytes) const {
337
+ const size_t size = header_size_bytes + get_serialized_size_bytes();
338
+ std::vector<uint8_t, AllocU8<A>> bytes(size);
339
+ uint8_t* ptr = bytes.data() + header_size_bytes;
340
+ uint8_t* end_ptr = ptr + size;
341
+
342
+ bool empty = is_empty();
343
+ uint8_t preLongs = (empty ? PREAMBLE_LONGS_EMPTY
344
+ : (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL));
345
+ uint8_t first_byte = (preLongs & 0x3F) | ((static_cast<uint8_t>(rf_)) << 6);
346
+ uint8_t flags = (marks_ != nullptr ? GADGET_FLAG_MASK : 0);
347
+
348
+ if (empty) {
349
+ flags |= EMPTY_FLAG_MASK;
350
+ }
351
+
352
+ // first prelong
353
+ uint8_t ser_ver(SER_VER);
354
+ uint8_t family(FAMILY_ID);
355
+ ptr += copy_to_mem(&first_byte, ptr, sizeof(uint8_t));
356
+ ptr += copy_to_mem(&ser_ver, ptr, sizeof(uint8_t));
357
+ ptr += copy_to_mem(&family, ptr, sizeof(uint8_t));
358
+ ptr += copy_to_mem(&flags, ptr, sizeof(uint8_t));
359
+ ptr += copy_to_mem(&k_, ptr, sizeof(uint32_t));
360
+
361
+ if (!empty) {
362
+ // second and third prelongs
363
+ ptr += copy_to_mem(&n_, ptr, sizeof(uint64_t));
364
+ ptr += copy_to_mem(&h_, ptr, sizeof(uint32_t));
365
+ ptr += copy_to_mem(&r_, ptr, sizeof(uint32_t));
366
+
367
+ // fourth prelong, if needed
368
+ if (r_ > 0) {
369
+ ptr += copy_to_mem(&total_wt_r_, ptr, sizeof(double));
370
+ }
371
+
372
+ // first h_ weights
373
+ ptr += copy_to_mem(weights_, ptr, h_ * sizeof(double));
374
+
375
+ // first h_ marks as packed bytes iff we have a gadget
376
+ if (marks_ != nullptr) {
377
+ uint8_t val = 0;
378
+ for (uint32_t i = 0; i < h_; ++i) {
379
+ if (marks_[i]) {
380
+ val |= 0x1 << (i & 0x7);
381
+ }
382
+
383
+ if ((i & 0x7) == 0x7) {
384
+ ptr += copy_to_mem(&val, ptr, sizeof(uint8_t));
385
+ val = 0;
386
+ }
387
+ }
388
+
389
+ // write out any remaining values
390
+ if ((h_ & 0x7) > 0) {
391
+ ptr += copy_to_mem(&val, ptr, sizeof(uint8_t));
392
+ }
393
+ }
394
+
395
+ // write the sample items, skipping the gap. Either h_ or r_ may be 0
396
+ ptr += S().serialize(ptr, end_ptr - ptr, data_, h_);
397
+ ptr += S().serialize(ptr, end_ptr - ptr, &data_[h_ + 1], r_);
398
+ }
399
+
400
+ size_t bytes_written = ptr - bytes.data();
401
+ if (bytes_written != size) {
402
+ throw std::logic_error("serialized size mismatch: " + std::to_string(bytes_written) + " != " + std::to_string(size));
403
+ }
404
+
405
+ return bytes;
406
+ }
407
+
408
+ template<typename T, typename S, typename A>
409
+ void var_opt_sketch<T,S,A>::serialize(std::ostream& os) const {
410
+ const bool empty = (h_ == 0) && (r_ == 0);
411
+
412
+ const uint8_t preLongs = (empty ? PREAMBLE_LONGS_EMPTY
413
+ : (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL));
414
+ const uint8_t first_byte = (preLongs & 0x3F) | ((static_cast<uint8_t>(rf_)) << 6);
415
+ uint8_t flags = (marks_ != nullptr ? GADGET_FLAG_MASK : 0);
416
+
417
+ if (empty) {
418
+ flags |= EMPTY_FLAG_MASK;
419
+ }
420
+
421
+ // first prelong
422
+ const uint8_t ser_ver(SER_VER);
423
+ const uint8_t family(FAMILY_ID);
424
+ os.write((char*)&first_byte, sizeof(uint8_t));
425
+ os.write((char*)&ser_ver, sizeof(uint8_t));
426
+ os.write((char*)&family, sizeof(uint8_t));
427
+ os.write((char*)&flags, sizeof(uint8_t));
428
+ os.write((char*)&k_, sizeof(uint32_t));
429
+
430
+ if (!empty) {
431
+ // second and third prelongs
432
+ os.write((char*)&n_, sizeof(uint64_t));
433
+ os.write((char*)&h_, sizeof(uint32_t));
434
+ os.write((char*)&r_, sizeof(uint32_t));
435
+
436
+ // fourth prelong, if needed
437
+ if (r_ > 0) {
438
+ os.write((char*)&total_wt_r_, sizeof(double));
439
+ }
440
+
441
+ // write the first h_ weights
442
+ os.write((char*)weights_, h_ * sizeof(double));
443
+
444
+ // write the first h_ marks as packed bytes iff we have a gadget
445
+ if (marks_ != nullptr) {
446
+ uint8_t val = 0;
447
+ for (uint32_t i = 0; i < h_; ++i) {
448
+ if (marks_[i]) {
449
+ val |= 0x1 << (i & 0x7);
450
+ }
451
+
452
+ if ((i & 0x7) == 0x7) {
453
+ os.write((char*)&val, sizeof(uint8_t));
454
+ val = 0;
455
+ }
456
+ }
457
+
458
+ // write out any remaining values
459
+ if ((h_ & 0x7) > 0) {
460
+ os.write((char*)&val, sizeof(uint8_t));
461
+ }
462
+ }
463
+
464
+ // write the sample items, skipping the gap. Either h_ or r_ may be 0
465
+ S().serialize(os, data_, h_);
466
+ S().serialize(os, &data_[h_ + 1], r_);
467
+ }
468
+ }
469
+
470
+ template<typename T, typename S, typename A>
471
+ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size_t size) {
472
+ ensure_minimum_memory(size, 8);
473
+ const char* ptr = static_cast<const char*>(bytes);
474
+ const char* base = ptr;
475
+ const char* end_ptr = ptr + size;
476
+ uint8_t first_byte;
477
+ ptr += copy_from_mem(ptr, &first_byte, sizeof(first_byte));
478
+ uint8_t preamble_longs = first_byte & 0x3f;
479
+ resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
480
+ uint8_t serial_version;
481
+ ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
482
+ uint8_t family_id;
483
+ ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
484
+ uint8_t flags;
485
+ ptr += copy_from_mem(ptr, &flags, sizeof(flags));
486
+ uint32_t k;
487
+ ptr += copy_from_mem(ptr, &k, sizeof(k));
488
+
489
+ check_preamble_longs(preamble_longs, flags);
490
+ check_family_and_serialization_version(family_id, serial_version);
491
+ ensure_minimum_memory(size, preamble_longs << 3);
492
+
493
+ const bool is_empty = flags & EMPTY_FLAG_MASK;
494
+ const bool is_gadget = flags & GADGET_FLAG_MASK;
495
+
496
+ if (is_empty) {
497
+ return var_opt_sketch<T,S,A>(k, rf, is_gadget);
498
+ }
499
+
500
+ // second and third prelongs
501
+ uint64_t n;
502
+ uint32_t h, r;
503
+ ptr += copy_from_mem(ptr, &n, sizeof(n));
504
+ ptr += copy_from_mem(ptr, &h, sizeof(h));
505
+ ptr += copy_from_mem(ptr, &r, sizeof(r));
506
+
507
+ const uint32_t array_size = validate_and_get_target_size(preamble_longs, k, n, h, r, rf);
508
+
509
+ // current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating
510
+ double total_wt_r = 0.0;
511
+ if (preamble_longs == PREAMBLE_LONGS_FULL) {
512
+ ptr += copy_from_mem(ptr, &total_wt_r, sizeof(total_wt_r));
513
+ if (std::isnan(total_wt_r) || r == 0 || total_wt_r <= 0.0) {
514
+ throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
515
+ "Found r = " + std::to_string(r) + ", R region weight = " + std::to_string(total_wt_r));
516
+ }
517
+ } else {
518
+ total_wt_r = 0.0;
519
+ }
520
+
521
+ // read the first h_ weights, fill in rest of array with -1.0
522
+ check_memory_size(ptr - base + (h * sizeof(double)), size);
523
+ std::unique_ptr<double, weights_deleter> weights(AllocDouble().allocate(array_size), weights_deleter(array_size));
524
+ double* wts = weights.get(); // to avoid lots of .get() calls -- do not delete
525
+ ptr += copy_from_mem(ptr, wts, h * sizeof(double));
526
+ for (size_t i = 0; i < h; ++i) {
527
+ if (!(wts[i] > 0.0)) {
528
+ throw std::invalid_argument("Possible corruption: Non-positive weight when deserializing: " + std::to_string(wts[i]));
529
+ }
530
+ }
531
+ std::fill(&wts[h], &wts[array_size], -1.0);
532
+
533
+ // read the first h_ marks as packed bytes iff we have a gadget
534
+ uint32_t num_marks_in_h = 0;
535
+ std::unique_ptr<bool, marks_deleter> marks(nullptr, marks_deleter(array_size));
536
+ if (is_gadget) {
537
+ uint8_t val = 0;
538
+ marks = std::unique_ptr<bool, marks_deleter>(AllocBool().allocate(array_size), marks_deleter(array_size));
539
+ const size_t size_marks = (h / 8) + (h % 8 > 0 ? 1 : 0);
540
+ check_memory_size(ptr - base + size_marks, size);
541
+ for (uint32_t i = 0; i < h; ++i) {
542
+ if ((i & 0x7) == 0x0) { // should trigger on first iteration
543
+ ptr += copy_from_mem(ptr, &val, sizeof(val));
544
+ }
545
+ marks.get()[i] = ((val >> (i & 0x7)) & 0x1) == 1;
546
+ num_marks_in_h += (marks.get()[i] ? 1 : 0);
547
+ }
548
+ }
549
+
550
+ // read the sample items, skipping the gap. Either h_ or r_ may be 0
551
+ items_deleter deleter(array_size);
552
+ std::unique_ptr<T, items_deleter> items(A().allocate(array_size), deleter);
553
+
554
+ ptr += S().deserialize(ptr, end_ptr - ptr, items.get(), h);
555
+ items.get_deleter().set_h(h); // serde didn't throw, so the items are now valid
556
+
557
+ ptr += S().deserialize(ptr, end_ptr - ptr, &(items.get()[h + 1]), r);
558
+ items.get_deleter().set_r(r); // serde didn't throw, so the items are now valid
559
+
560
+ return var_opt_sketch(k, h, (r > 0 ? 1 : 0), r, n, total_wt_r, rf, array_size, false,
561
+ std::move(items), std::move(weights), num_marks_in_h, std::move(marks));
562
+ }
563
+
564
+ template<typename T, typename S, typename A>
565
+ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is) {
566
+ uint8_t first_byte;
567
+ is.read((char*)&first_byte, sizeof(first_byte));
568
+ uint8_t preamble_longs = first_byte & 0x3f;
569
+ resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
570
+ uint8_t serial_version;
571
+ is.read((char*)&serial_version, sizeof(serial_version));
572
+ uint8_t family_id;
573
+ is.read((char*)&family_id, sizeof(family_id));
574
+ uint8_t flags;
575
+ is.read((char*)&flags, sizeof(flags));
576
+ uint32_t k;
577
+ is.read((char*)&k, sizeof(k));
578
+
579
+ check_preamble_longs(preamble_longs, flags);
580
+ check_family_and_serialization_version(family_id, serial_version);
581
+
582
+ const bool is_empty = flags & EMPTY_FLAG_MASK;
583
+ const bool is_gadget = flags & GADGET_FLAG_MASK;
584
+
585
+ if (is_empty) {
586
+ if (!is.good())
587
+ throw std::runtime_error("error reading from std::istream");
588
+ else
589
+ return var_opt_sketch<T,S,A>(k, rf, is_gadget);
590
+ }
591
+
592
+ // second and third prelongs
593
+ uint64_t n;
594
+ uint32_t h, r;
595
+ is.read((char*)&n, sizeof(n));
596
+ is.read((char*)&h, sizeof(h));
597
+ is.read((char*)&r, sizeof(r));
598
+
599
+ const uint32_t array_size = validate_and_get_target_size(preamble_longs, k, n, h, r, rf);
600
+
601
+ // current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating
602
+ double total_wt_r = 0.0;
603
+ if (preamble_longs == PREAMBLE_LONGS_FULL) {
604
+ is.read((char*)&total_wt_r, sizeof(total_wt_r));
605
+ if (std::isnan(total_wt_r) || r == 0 || total_wt_r <= 0.0) {
606
+ throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
607
+ "Found r = " + std::to_string(r) + ", R region weight = " + std::to_string(total_wt_r));
608
+ }
609
+ } else {
610
+ total_wt_r = 0.0;
611
+ }
612
+
613
+ // read the first h weights, fill remainder with -1.0
614
+ std::unique_ptr<double, weights_deleter> weights(AllocDouble().allocate(array_size), weights_deleter(array_size));
615
+ double* wts = weights.get(); // to avoid lots of .get() calls -- do not delete
616
+ is.read((char*)wts, h * sizeof(double));
617
+ for (size_t i = 0; i < h; ++i) {
618
+ if (!(wts[i] > 0.0)) {
619
+ throw std::invalid_argument("Possible corruption: Non-positive weight when deserializing: " + std::to_string(wts[i]));
620
+ }
621
+ }
622
+ std::fill(&wts[h], &wts[array_size], -1.0);
623
+
624
+ // read the first h_ marks as packed bytes iff we have a gadget
625
+ uint32_t num_marks_in_h = 0;
626
+ std::unique_ptr<bool, marks_deleter> marks(nullptr, marks_deleter(array_size));
627
+ if (is_gadget) {
628
+ marks = std::unique_ptr<bool, marks_deleter>(AllocBool().allocate(array_size), marks_deleter(array_size));
629
+ uint8_t val = 0;
630
+ for (uint32_t i = 0; i < h; ++i) {
631
+ if ((i & 0x7) == 0x0) { // should trigger on first iteration
632
+ is.read((char*)&val, sizeof(val));
633
+ }
634
+ marks.get()[i] = ((val >> (i & 0x7)) & 0x1) == 1;
635
+ num_marks_in_h += (marks.get()[i] ? 1 : 0);
636
+ }
637
+ }
638
+
639
+ // read the sample items, skipping the gap. Either h or r may be 0
640
+ items_deleter deleter(array_size);
641
+ std::unique_ptr<T, items_deleter> items(A().allocate(array_size), deleter);
642
+
643
+ S().deserialize(is, items.get(), h); // aka &data_[0]
644
+ items.get_deleter().set_h(h); // serde didn't throw, so the items are now valid
645
+
646
+ S().deserialize(is, &(items.get()[h + 1]), r);
647
+ items.get_deleter().set_r(r); // serde didn't throw, so the items are now valid
648
+
649
+ if (!is.good())
650
+ throw std::runtime_error("error reading from std::istream");
651
+
652
+ return var_opt_sketch(k, h, (r > 0 ? 1 : 0), r, n, total_wt_r, rf, array_size, false,
653
+ std::move(items), std::move(weights), num_marks_in_h, std::move(marks));
654
+ }
655
+
656
+ template<typename T, typename S, typename A>
657
+ bool var_opt_sketch<T,S,A>::is_empty() const {
658
+ return (h_ == 0 && r_ == 0);
659
+ }
660
+
661
+ template<typename T, typename S, typename A>
662
+ void var_opt_sketch<T,S,A>::reset() {
663
+ const uint32_t prev_alloc = curr_items_alloc_;
664
+ const uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
665
+ const uint32_t initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf_, MIN_LG_ARR_ITEMS);
666
+ curr_items_alloc_ = get_adjusted_size(k_, 1 << initial_lg_size);
667
+ if (curr_items_alloc_ == k_) { // if full size, need to leave 1 for the gap
668
+ ++curr_items_alloc_;
669
+ }
670
+
671
+ if (filled_data_) {
672
+ // destroy everything
673
+ const size_t num_to_destroy = std::min(k_ + 1, prev_alloc);
674
+ for (size_t i = 0; i < num_to_destroy; ++i)
675
+ A().destroy(data_ + i);
676
+ } else {
677
+ // skip gap or anything unused at the end
678
+ for (size_t i = 0; i < h_; ++i)
679
+ A().destroy(data_+ i);
680
+
681
+ for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i)
682
+ A().destroy(data_ + i);
683
+ }
684
+
685
+ if (curr_items_alloc_ < prev_alloc) {
686
+ const bool is_gadget = (marks_ != nullptr);
687
+
688
+ A().deallocate(data_, prev_alloc);
689
+ AllocDouble().deallocate(weights_, prev_alloc);
690
+
691
+ if (marks_ != nullptr)
692
+ AllocBool().deallocate(marks_, prev_alloc);
693
+
694
+ allocate_data_arrays(curr_items_alloc_, is_gadget);
695
+ }
696
+
697
+ n_ = 0;
698
+ h_ = 0;
699
+ m_ = 0;
700
+ r_ = 0;
701
+ num_marks_in_h_ = 0;
702
+ total_wt_r_ = 0.0;
703
+ filled_data_ = false;
704
+ }
705
+
706
+ template<typename T, typename S, typename A>
707
+ uint64_t var_opt_sketch<T,S,A>::get_n() const {
708
+ return n_;
709
+ }
710
+
711
+ template<typename T, typename S, typename A>
712
+ uint32_t var_opt_sketch<T,S,A>::get_k() const {
713
+ return k_;
714
+ }
715
+
716
+ template<typename T, typename S, typename A>
717
+ uint32_t var_opt_sketch<T,S,A>::get_num_samples() const {
718
+ const uint32_t num_in_sketch = h_ + r_;
719
+ return (num_in_sketch < k_ ? num_in_sketch : k_);
720
+ }
721
+
722
+ template<typename T, typename S, typename A>
723
+ void var_opt_sketch<T,S,A>::update(const T& item, double weight) {
724
+ update(item, weight, false);
725
+ }
726
+
727
+ template<typename T, typename S, typename A>
728
+ void var_opt_sketch<T,S,A>::update(T&& item, double weight) {
729
+ update(std::move(item), weight, false);
730
+ }
731
+
732
+ template<typename T, typename S, typename A>
733
+ string<A> var_opt_sketch<T,S,A>::to_string() const {
734
+ std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
735
+ os << "### VarOpt SUMMARY: " << std::endl;
736
+ os << " k : " << k_ << std::endl;
737
+ os << " h : " << h_ << std::endl;
738
+ os << " r : " << r_ << std::endl;
739
+ os << " weight_r : " << total_wt_r_ << std::endl;
740
+ os << " Current size : " << curr_items_alloc_ << std::endl;
741
+ os << " Resize factor: " << (1 << rf_) << std::endl;
742
+ os << "### END SKETCH SUMMARY" << std::endl;
743
+ return os.str();
744
+ }
745
+
746
+ template<typename T, typename S, typename A>
747
+ string<A> var_opt_sketch<T,S,A>::items_to_string() const {
748
+ std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
749
+ os << "### Sketch Items" << std::endl;
750
+ int idx = 0;
751
+ for (auto record : *this) {
752
+ os << idx << ": " << record.first << "\twt = " << record.second << std::endl;
753
+ ++idx;
754
+ }
755
+ return os.str();
756
+ }
757
+
758
+ template<typename T, typename S, typename A>
759
+ string<A> var_opt_sketch<T,S,A>::items_to_string(bool print_gap) const {
760
+ std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
761
+ os << "### Sketch Items" << std::endl;
762
+ const uint32_t array_length = (n_ < k_ ? n_ : k_ + 1);
763
+ for (uint32_t i = 0, display_idx = 0; i < array_length; ++i) {
764
+ if (i == h_ && print_gap) {
765
+ os << i << ": GAP" << std::endl;
766
+ ++display_idx;
767
+ } else {
768
+ os << i << ": " << data_[i] << "\twt = ";
769
+ if (weights_[i] == -1.0) {
770
+ os << get_tau() << "\t(-1.0)" << std::endl;
771
+ } else {
772
+ os << weights_[i] << std::endl;
773
+ }
774
+ ++display_idx;
775
+ }
776
+ }
777
+ return os.str();
778
+ }
779
+
780
+ template<typename T, typename S, typename A>
781
+ template<typename O>
782
+ void var_opt_sketch<T,S,A>::update(O&& item, double weight, bool mark) {
783
+ if (weight < 0.0 || std::isnan(weight) || std::isinf(weight)) {
784
+ throw std::invalid_argument("Item weights must be nonnegative and finite. Found: "
785
+ + std::to_string(weight));
786
+ } else if (weight == 0.0) {
787
+ return;
788
+ }
789
+ ++n_;
790
+
791
+ if (r_ == 0) {
792
+ // exact mode
793
+ update_warmup_phase(std::forward<O>(item), weight, mark);
794
+ } else {
795
+ // sketch is in estimation mode so we can make the following check,
796
+ // although very conservative to check every time
797
+ if ((h_ != 0) && (peek_min() < get_tau()))
798
+ throw std::logic_error("sketch not in valid estimation mode");
799
+
800
+ // what tau would be if deletion candidates turn out to be R plus the new item
801
+ // note: (r_ + 1) - 1 is intentional
802
+ const double hypothetical_tau = (weight + total_wt_r_) / ((r_ + 1) - 1);
803
+
804
+ // is new item's turn to be considered for reservoir?
805
+ const double condition1 = (h_ == 0) || (weight <= peek_min());
806
+
807
+ // is new item light enough for reservoir?
808
+ const double condition2 = weight < hypothetical_tau;
809
+
810
+ if (condition1 && condition2) {
811
+ update_light(std::forward<O>(item), weight, mark);
812
+ } else if (r_ == 1) {
813
+ update_heavy_r_eq1(std::forward<O>(item), weight, mark);
814
+ } else {
815
+ update_heavy_general(std::forward<O>(item), weight, mark);
816
+ }
817
+ }
818
+ }
819
+
820
+ template<typename T, typename S, typename A>
821
+ template<typename O>
822
+ void var_opt_sketch<T,S,A>::update_warmup_phase(O&& item, double weight, bool mark) {
823
+ // seems overly cautious
824
+ if (r_ > 0 || m_ != 0 || h_ > k_) throw std::logic_error("invalid sketch state during warmup");
825
+
826
+ if (h_ >= curr_items_alloc_) {
827
+ grow_data_arrays();
828
+ }
829
+
830
+ // store items as they come in until full
831
+ new (&data_[h_]) T(std::forward<O>(item));
832
+ weights_[h_] = weight;
833
+ if (marks_ != nullptr) {
834
+ marks_[h_] = mark;
835
+ }
836
+ ++h_;
837
+ num_marks_in_h_ += mark ? 1 : 0;
838
+
839
+ // check if need to heapify
840
+ if (h_ > k_) {
841
+ filled_data_ = true;
842
+ transition_from_warmup();
843
+ }
844
+ }
845
+
846
+ /* In the "light" case the new item has weight <= old_tau, so
847
+ would appear to the right of the R items in a hypothetical reverse-sorted
848
+ list. It is easy to prove that it is light enough to be part of this
849
+ round's downsampling */
850
+ template<typename T, typename S, typename A>
851
+ template<typename O>
852
+ void var_opt_sketch<T,S,A>::update_light(O&& item, double weight, bool mark) {
853
+ if (r_ == 0 || (r_ + h_) != k_) throw std::logic_error("invalid sketch state during light warmup");
854
+
855
+ const uint32_t m_slot = h_; // index of the gap, which becomes the M region
856
+ if (filled_data_) {
857
+ data_[m_slot] = std::forward<O>(item);
858
+ } else {
859
+ new (&data_[m_slot]) T(std::forward<O>(item));
860
+ filled_data_ = true;
861
+ }
862
+ weights_[m_slot] = weight;
863
+ if (marks_ != nullptr) { marks_[m_slot] = mark; }
864
+ ++m_;
865
+
866
+ grow_candidate_set(total_wt_r_ + weight, r_ + 1);
867
+ }
868
+
869
+ /* In the "heavy" case the new item has weight > old_tau, so would
870
+ appear to the left of items in R in a hypothetical reverse-sorted list and
871
+ might or might not be light enough be part of this round's downsampling.
872
+ [After first splitting off the R=1 case] we greatly simplify the code by
873
+ putting the new item into the H heap whether it needs to be there or not.
874
+ In other words, it might go into the heap and then come right back out,
875
+ but that should be okay because pseudo_heavy items cannot predominate
876
+ in long streams unless (max wt) / (min wt) > o(exp(N)) */
877
+ template<typename T, typename S, typename A>
878
+ template<typename O>
879
+ void var_opt_sketch<T,S,A>::update_heavy_general(O&& item, double weight, bool mark) {
880
+ if (r_ < 2 || m_ != 0 || (r_ + h_) != k_) throw std::logic_error("invalid sketch state during heavy general update");
881
+
882
+ // put into H, although may come back out momentarily
883
+ push(std::forward<O>(item), weight, mark);
884
+
885
+ grow_candidate_set(total_wt_r_, r_);
886
+ }
887
+
888
+ /* The analysis of this case is similar to that of the general heavy case.
889
+ The one small technical difference is that since R < 2, we must grab an M item
890
+ to have a valid starting point for continue_by_growing_candidate_set () */
891
+ template<typename T, typename S, typename A>
892
+ template<typename O>
893
+ void var_opt_sketch<T,S,A>::update_heavy_r_eq1(O&& item, double weight, bool mark) {
894
+ if (r_ != 1 || m_ != 0 || (r_ + h_) != k_) throw std::logic_error("invalid sketch state during heavy r=1 update");
895
+
896
+ push(std::forward<O>(item), weight, mark); // new item into H
897
+ pop_min_to_m_region(); // pop lightest back into M
898
+
899
+ // Any set of two items is downsample-able to one item,
900
+ // so the two lightest items are a valid starting point for the following
901
+ const uint32_t m_slot = k_ - 1; // array is k+1, 1 in R, so slot before is M
902
+ grow_candidate_set(weights_[m_slot] + total_wt_r_, 2);
903
+ }
904
+
905
+ /**
906
+ * Decreases sketch's value of k by 1, updating stored values as needed.
907
+ *
908
+ * <p>Subject to certain pre-conditions, decreasing k causes tau to increase. This fact is used by
909
+ * the unioning algorithm to force "marked" items out of H and into the reservoir region.</p>
910
+ */
911
+ template<typename T, typename S, typename A>
912
+ void var_opt_sketch<T,S,A>::decrease_k_by_1() {
913
+ if (k_ <= 1) {
914
+ throw std::logic_error("Cannot decrease k below 1 in union");
915
+ }
916
+
917
+ if ((h_ == 0) && (r_ == 0)) {
918
+ // exact mode, but no data yet; this reduction is somewhat gratuitous
919
+ --k_;
920
+ } else if ((h_ > 0) && (r_ == 0)) {
921
+ // exact mode, but we have some data
922
+ --k_;
923
+ if (h_ > k_) {
924
+ transition_from_warmup();
925
+ }
926
+ } else if ((h_ > 0) && (r_ > 0)) {
927
+ // reservoir mode, but we have some exact samples.
928
+ // Our strategy will be to pull an item out of H (which we are allowed to do since it's
929
+ // still just data), reduce k, and then re-insert the item
930
+
931
+ // first, slide the R zone to the left by 1, temporarily filling the gap
932
+ const uint32_t old_gap_idx = h_;
933
+ const uint32_t old_final_r_idx = (h_ + 1 + r_) - 1;
934
+ //if (old_final_r_idx != k_) throw std::logic_error("gadget in invalid state");
935
+
936
+ swap_values(old_final_r_idx, old_gap_idx);
937
+
938
+ // now we pull an item out of H; any item is ok, but if we grab the rightmost and then
939
+ // reduce h_, the heap invariant will be preserved (and the gap will be restored), plus
940
+ // the push() of the item that will probably happen later will be cheap.
941
+
942
+ const uint32_t pulled_idx = h_ - 1;
943
+ double pulled_weight = weights_[pulled_idx];
944
+ bool pulled_mark = marks_[pulled_idx];
945
+ // will move the pulled item below; don't do antying to it here
946
+
947
+ if (pulled_mark) { --num_marks_in_h_; }
948
+ weights_[pulled_idx] = -1.0; // to make bugs easier to spot
949
+
950
+ --h_;
951
+ --k_;
952
+ --n_; // will be re-incremented with the update
953
+
954
+ update(std::move(data_[pulled_idx]), pulled_weight, pulled_mark);
955
+ } else if ((h_ == 0) && (r_ > 0)) {
956
+ // pure reservoir mode, so can simply eject a randomly chosen sample from the reservoir
957
+ if (r_ < 2) throw std::logic_error("r_ too small for pure reservoir mode");
958
+
959
+ const uint32_t r_idx_to_delete = 1 + next_int(r_); // 1 for the gap
960
+ const uint32_t rightmost_r_idx = (1 + r_) - 1;
961
+ swap_values(r_idx_to_delete, rightmost_r_idx);
962
+ weights_[rightmost_r_idx] = -1.0;
963
+
964
+ --k_;
965
+ --r_;
966
+ }
967
+ }
968
+
969
+ template<typename T, typename S, typename A>
970
+ void var_opt_sketch<T,S,A>::allocate_data_arrays(uint32_t tgt_size, bool use_marks) {
971
+ filled_data_ = false;
972
+
973
+ data_ = A().allocate(tgt_size);
974
+ weights_ = AllocDouble().allocate(tgt_size);
975
+
976
+ if (use_marks) {
977
+ marks_ = AllocBool().allocate(tgt_size);
978
+ } else {
979
+ marks_ = nullptr;
980
+ }
981
+ }
982
+
983
+ template<typename T, typename S, typename A>
984
+ void var_opt_sketch<T,S,A>::grow_data_arrays() {
985
+ const uint32_t prev_size = curr_items_alloc_;
986
+ curr_items_alloc_ = get_adjusted_size(k_, curr_items_alloc_ << rf_);
987
+ if (curr_items_alloc_ == k_) {
988
+ ++curr_items_alloc_;
989
+ }
990
+
991
+ if (prev_size < curr_items_alloc_) {
992
+ filled_data_ = false;
993
+
994
+ T* tmp_data = A().allocate(curr_items_alloc_);
995
+ double* tmp_weights = AllocDouble().allocate(curr_items_alloc_);
996
+
997
+ for (uint32_t i = 0; i < prev_size; ++i) {
998
+ new (&tmp_data[i]) T(std::move(data_[i]));
999
+ A().destroy(data_ + i);
1000
+ tmp_weights[i] = weights_[i];
1001
+ }
1002
+
1003
+ A().deallocate(data_, prev_size);
1004
+ AllocDouble().deallocate(weights_, prev_size);
1005
+
1006
+ data_ = tmp_data;
1007
+ weights_ = tmp_weights;
1008
+
1009
+ if (marks_ != nullptr) {
1010
+ bool* tmp_marks = AllocBool().allocate(curr_items_alloc_);
1011
+ for (uint32_t i = 0; i < prev_size; ++i) {
1012
+ tmp_marks[i] = marks_[i];
1013
+ }
1014
+ AllocBool().deallocate(marks_, prev_size);
1015
+ marks_ = tmp_marks;
1016
+ }
1017
+ }
1018
+ }
1019
+
1020
+ template<typename T, typename S, typename A>
1021
+ void var_opt_sketch<T,S,A>::transition_from_warmup() {
1022
+ // Move the 2 lightest items from H to M
1023
+ // But the lighter really belongs in R, so update counts to reflect that
1024
+ convert_to_heap();
1025
+ pop_min_to_m_region();
1026
+ pop_min_to_m_region();
1027
+ --m_;
1028
+ ++r_;
1029
+
1030
+ if (h_ != (k_ -1) || m_ != 1 || r_ != 1)
1031
+ throw std::logic_error("invalid state for transitioning from warmup");
1032
+
1033
+ // Update total weight in R and then, having grabbed the value, overwrite
1034
+ // in weight_ array to help make bugs more obvious
1035
+ total_wt_r_ = weights_[k_]; // only one item, known location
1036
+ weights_[k_] = -1.0;
1037
+
1038
+ // The two lightest items are ncessarily downsample-able to one item,
1039
+ // and are therefore a valid initial candidate set
1040
+ grow_candidate_set(weights_[k_ - 1] + total_wt_r_, 2);
1041
+ }
1042
+
1043
+ template<typename T, typename S, typename A>
1044
+ void var_opt_sketch<T,S,A>::convert_to_heap() {
1045
+ if (h_ < 2) {
1046
+ return; // nothing to do
1047
+ }
1048
+
1049
+ const uint32_t last_slot = h_ - 1;
1050
+ const int last_non_leaf = ((last_slot + 1) / 2) - 1;
1051
+
1052
+ for (int j = last_non_leaf; j >= 0; --j) {
1053
+ restore_towards_leaves(j);
1054
+ }
1055
+
1056
+ // validates heap, used for initial debugging
1057
+ //for (uint32_t j = h_ - 1; j >= 1; --j) {
1058
+ // uint32_t p = ((j + 1) / 2) - 1;
1059
+ // if (weights_[p] > weights_[j]) throw std::logic_error("invalid heap");
1060
+ //}
1061
+ }
1062
+
1063
+ template<typename T, typename S, typename A>
1064
+ void var_opt_sketch<T,S,A>::restore_towards_leaves(uint32_t slot_in) {
1065
+ const uint32_t last_slot = h_ - 1;
1066
+ if (h_ == 0 || slot_in > last_slot) throw std::logic_error("invalid heap state");
1067
+
1068
+ uint32_t slot = slot_in;
1069
+ uint32_t child = (2 * slot_in) + 1; // might be invalid, need to check
1070
+
1071
+ while (child <= last_slot) {
1072
+ uint32_t child2 = child + 1; // might also be invalid
1073
+ if ((child2 <= last_slot) && (weights_[child2] < weights_[child])) {
1074
+ // siwtch to other child if it's both valid and smaller
1075
+ child = child2;
1076
+ }
1077
+
1078
+ if (weights_[slot] <= weights_[child]) {
1079
+ // invariant holds so we're done
1080
+ break;
1081
+ }
1082
+
1083
+ // swap and continue
1084
+ swap_values(slot, child);
1085
+
1086
+ slot = child;
1087
+ child = (2 * slot) + 1; // might be invalid, checked on next loop
1088
+ }
1089
+ }
1090
+
1091
+ template<typename T, typename S, typename A>
1092
+ void var_opt_sketch<T,S,A>::restore_towards_root(uint32_t slot_in) {
1093
+ uint32_t slot = slot_in;
1094
+ uint32_t p = (((slot + 1) / 2) - 1); // valid if slot >= 1
1095
+ while ((slot > 0) && (weights_[slot] < weights_[p])) {
1096
+ swap_values(slot, p);
1097
+ slot = p;
1098
+ p = (((slot + 1) / 2) - 1); // valid if slot >= 1
1099
+ }
1100
+ }
1101
+
1102
+ template<typename T, typename S, typename A>
1103
+ template<typename O>
1104
+ void var_opt_sketch<T,S,A>::push(O&& item, double wt, bool mark) {
1105
+ if (filled_data_) {
1106
+ data_[h_] = std::forward<O>(item);
1107
+ } else {
1108
+ new (&data_[h_]) T(std::forward<O>(item));
1109
+ filled_data_ = true;
1110
+ }
1111
+ weights_[h_] = wt;
1112
+ if (marks_ != nullptr) {
1113
+ marks_[h_] = mark;
1114
+ num_marks_in_h_ += (mark ? 1 : 0);
1115
+ }
1116
+ ++h_;
1117
+
1118
+ restore_towards_root(h_ - 1); // need use old h_, but want accurate h_
1119
+ }
1120
+
1121
+ template<typename T, typename S, typename A>
1122
+ void var_opt_sketch<T,S,A>::pop_min_to_m_region() {
1123
+ if (h_ == 0 || (h_ + m_ + r_ != k_ + 1))
1124
+ throw std::logic_error("invalid heap state popping min to M region");
1125
+
1126
+ if (h_ == 1) {
1127
+ // just update bookkeeping
1128
+ ++m_;
1129
+ --h_;
1130
+ } else {
1131
+ // main case
1132
+ uint32_t tgt = h_ - 1; // last slot, will swap with root
1133
+ swap_values(0, tgt);
1134
+ ++m_;
1135
+ --h_;
1136
+
1137
+ restore_towards_leaves(0);
1138
+ }
1139
+
1140
+ if (is_marked(h_)) {
1141
+ --num_marks_in_h_;
1142
+ }
1143
+ }
1144
+
1145
+
1146
+ template<typename T, typename S, typename A>
1147
+ void var_opt_sketch<T,S,A>::swap_values(uint32_t src, uint32_t dst) {
1148
+ std::swap(data_[src], data_[dst]);
1149
+ std::swap(weights_[src], weights_[dst]);
1150
+
1151
+ if (marks_ != nullptr) {
1152
+ std::swap(marks_[src], marks_[dst]);
1153
+ }
1154
+ }
1155
+
1156
+ /* When entering here we should be in a well-characterized state where the
1157
+ new item has been placed in either h or m and we have a valid but not necessarily
1158
+ maximal sampling plan figured out. The array is completely full at this point.
1159
+ Everyone in h and m has an explicit weight. The candidates are right-justified
1160
+ and are either just the r set or the r set + exactly one m item. The number
1161
+ of cands is at least 2. We will now grow the candidate set as much as possible
1162
+ by pulling sufficiently light items from h to m.
1163
+ */
1164
+ template<typename T, typename S, typename A>
1165
+ void var_opt_sketch<T,S,A>::grow_candidate_set(double wt_cands, uint32_t num_cands) {
1166
+ if ((h_ + m_ + r_ != k_ + 1) || (num_cands < 1) || (num_cands != m_ + r_) || (m_ >= 2))
1167
+ throw std::logic_error("invariant violated when growing candidate set");
1168
+
1169
+ while (h_ > 0) {
1170
+ const double next_wt = peek_min();
1171
+ const double next_tot_wt = wt_cands + next_wt;
1172
+
1173
+ // test for strict lightness of next prospect (denominator multiplied through)
1174
+ // ideally: (next_wt * (next_num_cands-1) < next_tot_wt)
1175
+ // but can use num_cands directly
1176
+ if ((next_wt * num_cands) < next_tot_wt) {
1177
+ wt_cands = next_tot_wt;
1178
+ ++num_cands;
1179
+ pop_min_to_m_region(); // adjusts h_ and m_
1180
+ } else {
1181
+ break;
1182
+ }
1183
+ }
1184
+
1185
+ downsample_candidate_set(wt_cands, num_cands);
1186
+ }
1187
+
1188
+ template<typename T, typename S, typename A>
1189
+ void var_opt_sketch<T,S,A>::downsample_candidate_set(double wt_cands, uint32_t num_cands) {
1190
+ if (num_cands < 2 || h_ + num_cands != k_ + 1)
1191
+ throw std::logic_error("invalid num_cands when downsampling");
1192
+
1193
+ // need this before overwriting anything
1194
+ const uint32_t delete_slot = choose_delete_slot(wt_cands, num_cands);
1195
+ const uint32_t leftmost_cand_slot = h_;
1196
+ if (delete_slot < leftmost_cand_slot || delete_slot > k_)
1197
+ throw std::logic_error("invalid delete slot index when downsampling");
1198
+
1199
+ // Overwrite weights for items from M moving into R,
1200
+ // to make bugs more obvious. Also needed so anyone reading the
1201
+ // weight knows if it's invalid without checking h_ and m_
1202
+ const uint32_t stop_idx = leftmost_cand_slot + m_;
1203
+ for (uint32_t j = leftmost_cand_slot; j < stop_idx; ++j) {
1204
+ weights_[j] = -1.0;
1205
+ }
1206
+
1207
+ // The next two lines work even when delete_slot == leftmost_cand_slot
1208
+ data_[delete_slot] = std::move(data_[leftmost_cand_slot]);
1209
+ // cannot set data_[leftmost_cand_slot] to null since not uisng T*
1210
+
1211
+ m_ = 0;
1212
+ r_ = num_cands - 1;
1213
+ total_wt_r_ = wt_cands;
1214
+ }
1215
+
1216
+ template<typename T, typename S, typename A>
1217
+ uint32_t var_opt_sketch<T,S,A>::choose_delete_slot(double wt_cands, uint32_t num_cands) const {
1218
+ if (r_ == 0) throw std::logic_error("choosing delete slot while in exact mode");
1219
+
1220
+ if (m_ == 0) {
1221
+ // this happens if we insert a really heavy item
1222
+ return pick_random_slot_in_r();
1223
+ } else if (m_ == 1) {
1224
+ // check if we keep th item in M or pick oen from R
1225
+ // p(keep) = (num_cand - 1) * wt_M / wt_cand
1226
+ double wt_m_cand = weights_[h_]; // slot of item in M is h_
1227
+ if ((wt_cands * next_double_exclude_zero()) < ((num_cands - 1) * wt_m_cand)) {
1228
+ return pick_random_slot_in_r(); // keep item in M
1229
+ } else {
1230
+ return h_; // indext of item in M
1231
+ }
1232
+ } else {
1233
+ // general case
1234
+ const uint32_t delete_slot = choose_weighted_delete_slot(wt_cands, num_cands);
1235
+ const uint32_t first_r_slot = h_ + m_;
1236
+ if (delete_slot == first_r_slot) {
1237
+ return pick_random_slot_in_r();
1238
+ } else {
1239
+ return delete_slot;
1240
+ }
1241
+ }
1242
+ }
1243
+
1244
+ template<typename T, typename S, typename A>
1245
+ uint32_t var_opt_sketch<T,S,A>::choose_weighted_delete_slot(double wt_cands, uint32_t num_cands) const {
1246
+ if (m_ < 1) throw std::logic_error("must have weighted delete slot");
1247
+
1248
+ const uint32_t offset = h_;
1249
+ const uint32_t final_m = (offset + m_) - 1;
1250
+ const uint32_t num_to_keep = num_cands - 1;
1251
+
1252
+ double left_subtotal = 0.0;
1253
+ double right_subtotal = -1.0 * wt_cands * next_double_exclude_zero();
1254
+
1255
+ for (uint32_t i = offset; i <= final_m; ++i) {
1256
+ left_subtotal += num_to_keep * weights_[i];
1257
+ right_subtotal += wt_cands;
1258
+
1259
+ if (left_subtotal < right_subtotal) {
1260
+ return i;
1261
+ }
1262
+ }
1263
+
1264
+ // this slot tells caller that we need to delete out of R
1265
+ return final_m + 1;
1266
+ }
1267
+
1268
+ template<typename T, typename S, typename A>
1269
+ uint32_t var_opt_sketch<T,S,A>::pick_random_slot_in_r() const {
1270
+ if (r_ == 0) throw std::logic_error("r_ = 0 when picking slot in R region");
1271
+ const uint32_t offset = h_ + m_;
1272
+ if (r_ == 1) {
1273
+ return offset;
1274
+ } else {
1275
+ return offset + next_int(r_);
1276
+ }
1277
+ }
1278
+
1279
+ template<typename T, typename S, typename A>
1280
+ double var_opt_sketch<T,S,A>::peek_min() const {
1281
+ if (h_ == 0) throw std::logic_error("h_ = 0 when checking min in H region");
1282
+ return weights_[0];
1283
+ }
1284
+
1285
+ template<typename T, typename S, typename A>
1286
+ inline bool var_opt_sketch<T,S,A>::is_marked(uint32_t idx) const {
1287
+ return marks_ == nullptr ? false : marks_[idx];
1288
+ }
1289
+
1290
+ template<typename T, typename S, typename A>
1291
+ double var_opt_sketch<T,S,A>::get_tau() const {
1292
+ return r_ == 0 ? std::nan("1") : (total_wt_r_ / r_);
1293
+ }
1294
+
1295
+ template<typename T, typename S, typename A>
1296
+ void var_opt_sketch<T,S,A>::strip_marks() {
1297
+ if (marks_ == nullptr) throw std::logic_error("request to strip marks from non-gadget");
1298
+ num_marks_in_h_ = 0;
1299
+ AllocBool().deallocate(marks_, curr_items_alloc_);
1300
+ marks_ = nullptr;
1301
+ }
1302
+
1303
+ template<typename T, typename S, typename A>
1304
+ void var_opt_sketch<T,S,A>::check_preamble_longs(uint8_t preamble_longs, uint8_t flags) {
1305
+ const bool is_empty(flags & EMPTY_FLAG_MASK);
1306
+
1307
+ if (is_empty) {
1308
+ if (preamble_longs != PREAMBLE_LONGS_EMPTY) {
1309
+ throw std::invalid_argument("Possible corruption: Preamble longs must be "
1310
+ + std::to_string(PREAMBLE_LONGS_EMPTY) + " for an empty sketch. Found: "
1311
+ + std::to_string(preamble_longs));
1312
+ }
1313
+ } else {
1314
+ if (preamble_longs != PREAMBLE_LONGS_WARMUP
1315
+ && preamble_longs != PREAMBLE_LONGS_FULL) {
1316
+ throw std::invalid_argument("Possible corruption: Preamble longs must be "
1317
+ + std::to_string(PREAMBLE_LONGS_WARMUP) + " or "
1318
+ + std::to_string(PREAMBLE_LONGS_FULL)
1319
+ + " for a non-empty sketch. Found: " + std::to_string(preamble_longs));
1320
+ }
1321
+ }
1322
+ }
1323
+
1324
+ template<typename T, typename S, typename A>
1325
+ void var_opt_sketch<T,S,A>::check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver) {
1326
+ if (family_id == FAMILY_ID) {
1327
+ if (ser_ver != SER_VER) {
1328
+ throw std::invalid_argument("Possible corruption: VarOpt serialization version must be "
1329
+ + std::to_string(SER_VER) + ". Found: " + std::to_string(ser_ver));
1330
+ }
1331
+ return;
1332
+ }
1333
+ // TODO: extend to handle reservoir sampling
1334
+
1335
+ throw std::invalid_argument("Possible corruption: VarOpt family id must be "
1336
+ + std::to_string(FAMILY_ID) + ". Found: " + std::to_string(family_id));
1337
+ }
1338
+
1339
+ template<typename T, typename S, typename A>
1340
+ uint32_t var_opt_sketch<T, S, A>::validate_and_get_target_size(uint32_t preamble_longs, uint32_t k, uint64_t n,
1341
+ uint32_t h, uint32_t r, resize_factor rf) {
1342
+ if (k == 0 || k > MAX_K) {
1343
+ throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
1344
+ }
1345
+
1346
+ uint32_t array_size;
1347
+
1348
+ if (n <= k) {
1349
+ if (preamble_longs != PREAMBLE_LONGS_WARMUP) {
1350
+ throw std::invalid_argument("Possible corruption: deserializing with n <= k but not in warmup mode. "
1351
+ "Found n = " + std::to_string(n) + ", k = " + std::to_string(k));
1352
+ }
1353
+ if (n != h) {
1354
+ throw std::invalid_argument("Possible corruption: deserializing in warmup mode but n != h. "
1355
+ "Found n = " + std::to_string(n) + ", h = " + std::to_string(h));
1356
+ }
1357
+ if (r > 0) {
1358
+ throw std::invalid_argument("Possible corruption: deserializing in warmup mode but r > 0. "
1359
+ "Found r = " + std::to_string(r));
1360
+ }
1361
+
1362
+ const uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k));
1363
+ const uint32_t min_lg_size = to_log_2(ceiling_power_of_2(h));
1364
+ const uint32_t initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf, min_lg_size);
1365
+ array_size = get_adjusted_size(k, 1 << initial_lg_size);
1366
+ if (array_size == k) { // if full size, need to leave 1 for the gap
1367
+ ++array_size;
1368
+ }
1369
+ } else { // n > k
1370
+ if (preamble_longs != PREAMBLE_LONGS_FULL) {
1371
+ throw std::invalid_argument("Possible corruption: deserializing with n > k but not in full mode. "
1372
+ "Found n = " + std::to_string(n) + ", k = " + std::to_string(k));
1373
+ }
1374
+ if (h + r != k) {
1375
+ throw std::invalid_argument("Possible corruption: deserializing in full mode but h + r != n. "
1376
+ "Found h = " + std::to_string(h) + ", r = " + std::to_string(r) + ", n = " + std::to_string(n));
1377
+ }
1378
+
1379
+ array_size = k + 1;
1380
+ }
1381
+
1382
+ return array_size;
1383
+ }
1384
+
1385
+ template<typename T, typename S, typename A>
1386
+ template<typename P>
1387
+ subset_summary var_opt_sketch<T, S, A>::estimate_subset_sum(P predicate) const {
1388
+ if (n_ == 0) {
1389
+ return {0.0, 0.0, 0.0, 0.0};
1390
+ }
1391
+
1392
+ double total_wt_h = 0.0;
1393
+ double h_true_wt = 0.0;
1394
+ size_t idx = 0;
1395
+ for (; idx < h_; ++idx) {
1396
+ double wt = weights_[idx];
1397
+ total_wt_h += wt;
1398
+ if (predicate(data_[idx])) {
1399
+ h_true_wt += wt;
1400
+ }
1401
+ }
1402
+
1403
+ // if only heavy items, we have an exact answer
1404
+ if (r_ == 0) {
1405
+ return {h_true_wt, h_true_wt, h_true_wt, h_true_wt};
1406
+ }
1407
+
1408
+ // since r_ > 0, we know we have samples
1409
+ const uint64_t num_samples = n_ - h_;
1410
+ double effective_sampling_rate = r_ / static_cast<double>(num_samples);
1411
+ if (effective_sampling_rate < 0.0 || effective_sampling_rate > 1.0)
1412
+ throw std::logic_error("invalid sampling rate outside [0.0, 1.0]");
1413
+
1414
+ size_t r_true_count = 0;
1415
+ ++idx; // skip the gap
1416
+ for (; idx < (k_ + 1); ++idx) {
1417
+ if (predicate(data_[idx])) {
1418
+ ++r_true_count;
1419
+ }
1420
+ }
1421
+
1422
+ double lb_true_fraction = pseudo_hypergeometric_lb_on_p(r_, r_true_count, effective_sampling_rate);
1423
+ double estimated_true_fraction = (1.0 * r_true_count) / r_;
1424
+ double ub_true_fraction = pseudo_hypergeometric_ub_on_p(r_, r_true_count, effective_sampling_rate);
1425
+
1426
+ return { h_true_wt + (total_wt_r_ * lb_true_fraction),
1427
+ h_true_wt + (total_wt_r_ * estimated_true_fraction),
1428
+ h_true_wt + (total_wt_r_ * ub_true_fraction),
1429
+ total_wt_h + total_wt_r_
1430
+ };
1431
+ }
1432
+
1433
+ template<typename T, typename S, typename A>
1434
+ class var_opt_sketch<T, S, A>::items_deleter {
1435
+ public:
1436
+ items_deleter(uint32_t num) : num(num), h_count(0), r_count(0) {}
1437
+ void set_h(uint32_t h) { h_count = h; }
1438
+ void set_r(uint32_t r) { r_count = r; }
1439
+ void operator() (T* ptr) const {
1440
+ if (h_count > 0) {
1441
+ for (size_t i = 0; i < h_count; ++i) {
1442
+ ptr[i].~T();
1443
+ }
1444
+ }
1445
+ if (r_count > 0) {
1446
+ uint32_t end = h_count + r_count + 1;
1447
+ for (size_t i = h_count + 1; i < end; ++i) {
1448
+ ptr[i].~T();
1449
+ }
1450
+ }
1451
+ if (ptr != nullptr) {
1452
+ A().deallocate(ptr, num);
1453
+ }
1454
+ }
1455
+ private:
1456
+ uint32_t num;
1457
+ uint32_t h_count;
1458
+ uint32_t r_count;
1459
+ };
1460
+
1461
+ template<typename T, typename S, typename A>
1462
+ class var_opt_sketch<T, S, A>::weights_deleter {
1463
+ public:
1464
+ weights_deleter(uint32_t num) : num(num) {}
1465
+ void operator() (double* ptr) const {
1466
+ if (ptr != nullptr) {
1467
+ AllocDouble().deallocate(ptr, num);
1468
+ }
1469
+ }
1470
+ private:
1471
+ uint32_t num;
1472
+ };
1473
+
1474
+ template<typename T, typename S, typename A>
1475
+ class var_opt_sketch<T, S, A>::marks_deleter {
1476
+ public:
1477
+ marks_deleter(uint32_t num) : num(num) {}
1478
+ void operator() (bool* ptr) const {
1479
+ if (ptr != nullptr) {
1480
+ AllocBool().deallocate(ptr, 1);
1481
+ }
1482
+ }
1483
+ private:
1484
+ uint32_t num;
1485
+ };
1486
+
1487
+
1488
+ template<typename T, typename S, typename A>
1489
+ typename var_opt_sketch<T, S, A>::const_iterator var_opt_sketch<T, S, A>::begin() const {
1490
+ return var_opt_sketch<T, S, A>::const_iterator(*this, false);
1491
+ }
1492
+
1493
+ template<typename T, typename S, typename A>
1494
+ typename var_opt_sketch<T, S, A>::const_iterator var_opt_sketch<T, S, A>::end() const {
1495
+ return var_opt_sketch<T, S, A>::const_iterator(*this, true);
1496
+ }
1497
+
1498
+ // -------- var_opt_sketch::const_iterator implementation ---------
1499
+
1500
+ template<typename T, typename S, typename A>
1501
+ var_opt_sketch<T,S,A>::const_iterator::const_iterator(const var_opt_sketch<T,S,A>& sk, bool is_end) :
1502
+ sk_(&sk),
1503
+ cum_r_weight_(0.0),
1504
+ r_item_wt_(sk.get_tau()),
1505
+ final_idx_(sk.r_ > 0 ? sk.h_ + sk.r_ + 1 : sk.h_)
1506
+ {
1507
+ // index logic easier to read if not inline
1508
+ if (is_end) {
1509
+ idx_ = final_idx_;
1510
+ sk_ = nullptr;
1511
+ } else {
1512
+ idx_ = (sk.h_ == 0 && sk.r_ > 0 ? 1 : 0); // skip if gap is at start
1513
+ }
1514
+
1515
+ // should only apply if sketch is empty
1516
+ if (idx_ == final_idx_) { sk_ = nullptr; }
1517
+ }
1518
+
1519
+ template<typename T, typename S, typename A>
1520
+ var_opt_sketch<T,S,A>::const_iterator::const_iterator(const var_opt_sketch<T,S,A>& sk, bool is_end, bool use_r_region) :
1521
+ sk_(&sk),
1522
+ cum_r_weight_(0.0),
1523
+ r_item_wt_(sk.get_tau()),
1524
+ final_idx_(sk.h_ + (use_r_region ? 1 + sk.r_ : 0))
1525
+ {
1526
+ if (use_r_region) {
1527
+ idx_ = sk.h_ + 1 + (is_end ? sk.r_ : 0);
1528
+ } else { // H region
1529
+ // gap at start only if h_ == 0, so index always starts at 0
1530
+ idx_ = (is_end ? sk.h_ : 0);
1531
+ }
1532
+
1533
+ // unlike in full iterator case, may happen even if sketch is not empty
1534
+ if (idx_ == final_idx_) { sk_ = nullptr; }
1535
+ }
1536
+
1537
+
1538
+ template<typename T, typename S, typename A>
1539
+ var_opt_sketch<T, S, A>::const_iterator::const_iterator(const const_iterator& other) :
1540
+ sk_(other.sk_),
1541
+ cum_r_weight_(other.cum_r_weight_),
1542
+ r_item_wt_(other.r_item_wt_),
1543
+ idx_(other.idx_),
1544
+ final_idx_(other.final_idx_)
1545
+ {}
1546
+
1547
+ template<typename T, typename S, typename A>
1548
+ typename var_opt_sketch<T, S, A>::const_iterator& var_opt_sketch<T, S, A>::const_iterator::operator++() {
1549
+ ++idx_;
1550
+
1551
+ if (idx_ == final_idx_) {
1552
+ sk_ = nullptr;
1553
+ return *this;
1554
+ } else if (idx_ == sk_->h_ && sk_->r_ > 0) { // check for the gap
1555
+ ++idx_;
1556
+ }
1557
+ if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
1558
+ return *this;
1559
+ }
1560
+
1561
+ template<typename T, typename S, typename A>
1562
+ typename var_opt_sketch<T, S, A>::const_iterator& var_opt_sketch<T, S, A>::const_iterator::operator++(int) {
1563
+ const_iterator tmp(*this);
1564
+ operator++();
1565
+ return tmp;
1566
+ }
1567
+
1568
+ template<typename T, typename S, typename A>
1569
+ bool var_opt_sketch<T, S, A>::const_iterator::operator==(const const_iterator& other) const {
1570
+ if (sk_ != other.sk_) return false;
1571
+ if (sk_ == nullptr) return true; // end (and we know other.sk_ is also null)
1572
+ return idx_ == other.idx_;
1573
+ }
1574
+
1575
+ template<typename T, typename S, typename A>
1576
+ bool var_opt_sketch<T, S, A>::const_iterator::operator!=(const const_iterator& other) const {
1577
+ return !operator==(other);
1578
+ }
1579
+
1580
+ template<typename T, typename S, typename A>
1581
+ const std::pair<const T&, const double> var_opt_sketch<T, S, A>::const_iterator::operator*() const {
1582
+ double wt;
1583
+ if (idx_ < sk_->h_) {
1584
+ wt = sk_->weights_[idx_];
1585
+ } else {
1586
+ wt = r_item_wt_;
1587
+ }
1588
+ return std::pair<const T&, const double>(sk_->data_[idx_], wt);
1589
+ }
1590
+
1591
+ template<typename T, typename S, typename A>
1592
+ bool var_opt_sketch<T, S, A>::const_iterator::get_mark() const {
1593
+ return sk_->marks_ == nullptr ? false : sk_->marks_[idx_];
1594
+ }
1595
+
1596
+
1597
+ // -------- var_opt_sketch::iterator implementation ---------
1598
+
1599
+ template<typename T, typename S, typename A>
1600
+ var_opt_sketch<T,S,A>::iterator::iterator(const var_opt_sketch<T,S,A>& sk, bool is_end, bool use_r_region) :
1601
+ sk_(&sk),
1602
+ cum_r_weight_(0.0),
1603
+ r_item_wt_(sk.get_tau()),
1604
+ final_idx_(sk.h_ + (use_r_region ? 1 + sk.r_ : 0))
1605
+ {
1606
+ if (use_r_region) {
1607
+ idx_ = sk.h_ + 1 + (is_end ? sk.r_ : 0);
1608
+ } else { // H region
1609
+ // gap at start only if h_ == 0, so index always starts at 0
1610
+ idx_ = (is_end ? sk.h_ : 0);
1611
+ }
1612
+
1613
+ // unlike in full iterator case, may happen even if sketch is not empty
1614
+ if (idx_ == final_idx_) { sk_ = nullptr; }
1615
+ }
1616
+
1617
+ template<typename T, typename S, typename A>
1618
+ var_opt_sketch<T, S, A>::iterator::iterator(const iterator& other) :
1619
+ sk_(other.sk_),
1620
+ cum_r_weight_(other.cum_r_weight_),
1621
+ r_item_wt_(other.r_item_wt_),
1622
+ idx_(other.idx_),
1623
+ final_idx_(other.final_idx_)
1624
+ {}
1625
+
1626
+ template<typename T, typename S, typename A>
1627
+ typename var_opt_sketch<T, S, A>::iterator& var_opt_sketch<T, S, A>::iterator::operator++() {
1628
+ ++idx_;
1629
+
1630
+ if (idx_ == final_idx_) {
1631
+ sk_ = nullptr;
1632
+ return *this;
1633
+ } else if (idx_ == sk_->h_ && sk_->r_ > 0) { // check for the gap
1634
+ ++idx_;
1635
+ }
1636
+ if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
1637
+ return *this;
1638
+ }
1639
+
1640
+ template<typename T, typename S, typename A>
1641
+ typename var_opt_sketch<T, S, A>::iterator& var_opt_sketch<T, S, A>::iterator::operator++(int) {
1642
+ const_iterator tmp(*this);
1643
+ operator++();
1644
+ return tmp;
1645
+ }
1646
+
1647
+ template<typename T, typename S, typename A>
1648
+ bool var_opt_sketch<T, S, A>::iterator::operator==(const iterator& other) const {
1649
+ if (sk_ != other.sk_) return false;
1650
+ if (sk_ == nullptr) return true; // end (and we know other.sk_ is also null)
1651
+ return idx_ == other.idx_;
1652
+ }
1653
+
1654
+ template<typename T, typename S, typename A>
1655
+ bool var_opt_sketch<T, S, A>::iterator::operator!=(const iterator& other) const {
1656
+ return !operator==(other);
1657
+ }
1658
+
1659
+ template<typename T, typename S, typename A>
1660
+ std::pair<T&, double> var_opt_sketch<T, S, A>::iterator::operator*() {
1661
+ double wt;
1662
+ if (idx_ < sk_->h_) {
1663
+ wt = sk_->weights_[idx_];
1664
+ } else if (idx_ == final_idx_ - 1) {
1665
+ wt = sk_->total_wt_r_ - cum_r_weight_;
1666
+ } else {
1667
+ wt = r_item_wt_;
1668
+ }
1669
+ return std::pair<T&, double>(sk_->data_[idx_], wt);
1670
+ }
1671
+
1672
+ template<typename T, typename S, typename A>
1673
+ bool var_opt_sketch<T, S, A>::iterator::get_mark() const {
1674
+ return sk_->marks_ == nullptr ? false : sk_->marks_[idx_];
1675
+ }
1676
+
1677
+
1678
+
1679
+ // ******************** MOVE TO COMMON UTILS AREA EVENTUALLY *********************
1680
+
1681
// Shared random-number state used by the sampling helpers in this file.
namespace random_utils {
  // entropy source used once, to seed the engine below
  // (possibly unsafe in MinGW with GCC < 9.2)
  static std::random_device rd;

  // 64-bit Mersenne Twister engine seeded from rd
  static std::mt19937_64 rand{rd()};

  // distribution object producing doubles in [0.0, 1.0)
  static std::uniform_real_distribution<double> next_double{0.0, 1.0};
}
1686
+
1687
+ /**
1688
+ * Checks if target sampling allocation is more than 50% of max sampling size.
1689
+ * If so, returns max sampling size, otherwise passes through target size.
1690
+ */
1691
+ template<typename T, typename S, typename A>
1692
+ uint32_t var_opt_sketch<T,S,A>::get_adjusted_size(uint32_t max_size, uint32_t resize_target) {
1693
+ if (max_size - (resize_target << 1) < 0L) {
1694
+ return max_size;
1695
+ }
1696
+ return resize_target;
1697
+ }
1698
+
1699
+ template<typename T, typename S, typename A>
1700
+ uint32_t var_opt_sketch<T,S,A>::starting_sub_multiple(uint32_t lg_target, uint32_t lg_rf, uint32_t lg_min) {
1701
+ return (lg_target <= lg_min)
1702
+ ? lg_min : (lg_rf == 0) ? lg_target
1703
+ : (lg_target - lg_min) % lg_rf + lg_min;
1704
+ }
1705
+
1706
+ template<typename T, typename S, typename A>
1707
+ double var_opt_sketch<T,S,A>::pseudo_hypergeometric_ub_on_p(uint64_t n, uint32_t k, double sampling_rate) {
1708
+ const double adjusted_kappa = DEFAULT_KAPPA * sqrt(1 - sampling_rate);
1709
+ return bounds_binomial_proportions::approximate_upper_bound_on_p(n, k, adjusted_kappa);
1710
+ }
1711
+
1712
+ template<typename T, typename S, typename A>
1713
+ double var_opt_sketch<T,S,A>::pseudo_hypergeometric_lb_on_p(uint64_t n, uint32_t k, double sampling_rate) {
1714
+ const double adjusted_kappa = DEFAULT_KAPPA * sqrt(1 - sampling_rate);
1715
+ return bounds_binomial_proportions::approximate_lower_bound_on_p(n, k, adjusted_kappa);
1716
+ }
1717
+
1718
+ template<typename T, typename S, typename A>
1719
+ bool var_opt_sketch<T,S,A>::is_power_of_2(uint32_t v) {
1720
+ return v && !(v & (v - 1));
1721
+ }
1722
+
1723
+ template<typename T, typename S, typename A>
1724
+ uint32_t var_opt_sketch<T,S,A>::to_log_2(uint32_t v) {
1725
+ if (is_power_of_2(v)) {
1726
+ return count_trailing_zeros_in_u32(v);
1727
+ } else {
1728
+ throw std::invalid_argument("Attempt to compute integer log2 of non-positive or non-power of 2");
1729
+ }
1730
+ }
1731
+
1732
+ // Returns an integer in the range [0, max_value) -- excludes max_value
1733
+ template<typename T, typename S, typename A>
1734
+ uint32_t var_opt_sketch<T,S,A>::next_int(uint32_t max_value) {
1735
+ std::uniform_int_distribution<uint32_t> dist(0, max_value - 1);
1736
+ return dist(random_utils::rand);
1737
+ }
1738
+
1739
+ template<typename T, typename S, typename A>
1740
+ double var_opt_sketch<T,S,A>::next_double_exclude_zero() {
1741
+ double r = random_utils::next_double(random_utils::rand);
1742
+ while (r == 0.0) {
1743
+ r = random_utils::next_double(random_utils::rand);
1744
+ }
1745
+ return r;
1746
+ }
1747
+
1748
+ }
1749
+
1750
+ // namespace datasketches
1751
+
1752
+ #endif // _VAR_OPT_SKETCH_IMPL_HPP_