datasketches 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,48 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ add_library(sampling INTERFACE)
19
+
20
+ add_library(${PROJECT_NAME}::SAMPLING ALIAS sampling)
21
+
22
+ if (BUILD_TESTS)
23
+ add_subdirectory(test)
24
+ endif()
25
+
26
+ target_include_directories(sampling
27
+ INTERFACE
28
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
29
+ $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
30
+ )
31
+
32
+ target_link_libraries(sampling INTERFACE common)
33
+ target_compile_features(sampling INTERFACE cxx_std_11)
34
+
35
+ set(sampling_HEADERS "include/var_opt_sketch.hpp;include/var_opt_sketch_impl.hpp;include/var_opt_union.hpp;include/var_opt_union_impl.hpp") # public headers to install; includes the union headers, since var_opt_sketch.hpp forward-declares and befriends var_opt_union
36
+
37
+ install(TARGETS sampling
38
+ EXPORT ${PROJECT_NAME}
39
+ )
40
+
41
+ install(FILES ${sampling_HEADERS}
42
+ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
43
+
44
+ target_sources(sampling
45
+ INTERFACE
46
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/var_opt_sketch.hpp
47
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/var_opt_sketch_impl.hpp
48
+ )
@@ -0,0 +1,392 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _VAR_OPT_SKETCH_HPP_
21
+ #define _VAR_OPT_SKETCH_HPP_
22
+
23
+ #include "serde.hpp"
24
+ #include "common_defs.hpp"
25
+
26
+ #include <iterator>
27
+ #include <vector>
28
+
29
+
30
+ /**
31
+ * This sketch samples data from a stream of items, designed for optimal (minimum) variance when
32
+ * querying the sketch to estimate subset sums of items matching a provided predicate. Variance
33
+ * optimal (varopt) sampling is related to reservoir sampling, with improved error bounds for
34
+ * subset sum estimation.
35
+ *
36
+ * author Kevin Lang
37
+ * author Jon Malkin
38
+ */
39
+ namespace datasketches {
40
+
41
+ template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>; // allocator A rebound to allocate uint8_t
42
+ template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>; // std::vector of bytes that allocates through the rebound allocator
43
+
44
+ /**
45
+ * A struct to hold the result of subset sum queries
46
+ */
47
+ struct subset_summary {
48
+ double lower_bound; // lower bound on the subset sum
49
+ double estimate; // estimated subset sum
50
+ double upper_bound; // upper bound on the subset sum
51
+ double total_sketch_weight; // total sketch weight, reported alongside the bounds
52
+ };
53
+
54
+ enum resize_factor { X1 = 0, X2, X4, X8 }; // enumerator values are 0..3, not the multipliers themselves; presumably log2 of the growth factor — TODO confirm
55
+
56
+ template <typename T, typename S, typename A> class var_opt_union; // forward declaration
57
+
58
+ template <typename T, typename S = serde<T>, typename A = std::allocator<T>>
59
+ class var_opt_sketch {
60
+
61
+ public:
62
+ static const resize_factor DEFAULT_RESIZE_FACTOR = X8;
63
+ static const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
64
+
65
+ explicit var_opt_sketch(uint32_t k, resize_factor rf = DEFAULT_RESIZE_FACTOR);
66
+ var_opt_sketch(const var_opt_sketch& other);
67
+ var_opt_sketch(var_opt_sketch&& other) noexcept;
68
+
69
+ ~var_opt_sketch();
70
+
71
+ var_opt_sketch& operator=(const var_opt_sketch& other);
72
+ var_opt_sketch& operator=(var_opt_sketch&& other);
73
+
74
+ /**
75
+ * Updates this sketch with the given data item with the given weight.
76
+ * This method takes an lvalue.
77
+ * @param item an item from a stream of items
78
+ * @param weight the weight of the item
79
+ */
80
+ void update(const T& item, double weight=1.0);
81
+
82
+ /**
83
+ * Updates this sketch with the given data item with the given weight.
84
+ * This method takes an rvalue.
85
+ * @param item an item from a stream of items
86
+ * @param weight the weight of the item
87
+ */
88
+ void update(T&& item, double weight=1.0);
89
+
90
+ /**
91
+ * Returns the configured maximum sample size.
92
+ * @return configured maximum sample size
93
+ */
94
+ inline uint32_t get_k() const;
95
+
96
+ /**
97
+ * Returns the length of the input stream.
98
+ * @return stream length
99
+ */
100
+ inline uint64_t get_n() const;
101
+
102
+ /**
103
+ * Returns the number of samples currently in the sketch
104
+ * @return number of samples currently in the sketch
105
+ */
106
+ inline uint32_t get_num_samples() const;
107
+
108
+ /**
109
+ * Computes an estimated subset sum from the entire stream for objects matching a given
110
+ * predicate. Provides a lower bound, estimate, and upper bound using a target of 2 standard
111
+ * deviations. This is technically a heuristic method and tries to err on the conservative side.
112
+ * @param predicate a predicate function (of template type P)
113
+ * @return a subset_summary item with estimate, upper and lower bounds,
114
+ * and total sketch weight
115
+ */
116
+ template<typename P>
117
+ subset_summary estimate_subset_sum(P predicate) const;
118
+
119
+ /**
120
+ * Returns true if the sketch is empty.
121
+ * @return empty flag
122
+ */
123
+ inline bool is_empty() const;
124
+
125
+ /**
126
+ * Resets the sketch to its default, empty state.
127
+ */
128
+ void reset();
129
+
130
+ /**
131
+ * Computes size needed to serialize the current state of the sketch.
132
+ * This version is for fixed-size arithmetic types (integral and floating point).
133
+ * @return size in bytes needed to serialize this sketch
134
+ */
135
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
136
+ inline size_t get_serialized_size_bytes() const;
137
+
138
+ /**
139
+ * Computes size needed to serialize the current state of the sketch.
140
+ * This version is for all other types and can be expensive since every item needs to be looked at.
141
+ * @return size in bytes needed to serialize this sketch
142
+ */
143
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
144
+ inline size_t get_serialized_size_bytes() const;
145
+
146
+ // This is a convenience alias for users
147
+ // The type returned by the following serialize method
148
+ typedef vector_u8<A> vector_bytes;
149
+
150
+ /**
151
+ * This method serializes the sketch as a vector of bytes.
152
+ * An optional header can be reserved in front of the sketch.
153
+ * It is a blank space of a given size.
154
+ * This header is used in Datasketches PostgreSQL extension.
155
+ * @param header_size_bytes space to reserve in front of the sketch
156
+ */
157
+ vector_bytes serialize(unsigned header_size_bytes = 0) const;
158
+
159
+ /**
160
+ * This method serializes the sketch into a given stream in a binary form
161
+ * @param os output stream
162
+ */
163
+ void serialize(std::ostream& os) const;
164
+
165
+ /**
166
+ * This method deserializes a sketch from a given stream.
167
+ * @param is input stream
168
+ * @return an instance of a sketch
169
+ */
170
+ static var_opt_sketch deserialize(std::istream& is);
171
+
172
+ /**
173
+ * This method deserializes a sketch from a given array of bytes.
174
+ * @param bytes pointer to the array of bytes
175
+ * @param size the size of the array
176
+ * @return an instance of a sketch
177
+ */
178
+ static var_opt_sketch deserialize(const void* bytes, size_t size);
179
+
180
+ /**
181
+ * Prints a summary of the sketch.
182
+ * @return the summary as a string
183
+ */
184
+ string<A> to_string() const;
185
+
186
+ /**
187
+ * Prints the raw sketch items to a string. Calls items_to_stream() internally.
188
+ * Only works for type T with a defined operator<<() and
189
+ * kept separate from to_string() to allow compilation even if
190
+ * T does not have such an operator defined.
191
+ * @return a string with the sketch items
192
+ */
193
+ string<A> items_to_string() const;
194
+
195
+ class const_iterator;
196
+ const_iterator begin() const;
197
+ const_iterator end() const;
198
+
199
+ private:
200
+ typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
201
+ typedef typename std::allocator_traits<A>::template rebind_alloc<bool> AllocBool;
202
+
203
+ static const uint32_t MIN_LG_ARR_ITEMS = 3;
204
+
205
+ static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
206
+ static const uint8_t PREAMBLE_LONGS_WARMUP = 3;
207
+ static const uint8_t PREAMBLE_LONGS_FULL = 4;
208
+ static const uint8_t SER_VER = 2;
209
+ static const uint8_t FAMILY_ID = 13;
210
+ static const uint8_t EMPTY_FLAG_MASK = 4;
211
+ static const uint8_t GADGET_FLAG_MASK = 128;
212
+
213
+ // Number of standard deviations to use for subset sum error bounds
214
+ constexpr static const double DEFAULT_KAPPA = 2.0;
215
+
216
+ // TODO: should probably rearrange a bit to minimize gaps once aligned
217
+ uint32_t k_; // max size of sketch, in items
218
+
219
+ uint32_t h_; // number of items in heap
220
+ uint32_t m_; // number of items in middle region
221
+ uint32_t r_; // number of items in reservoir-like region
222
+
223
+ uint64_t n_; // total number of items processed by sketch
224
+ double total_wt_r_; // total weight of items in reservoir-like area
225
+
226
+ resize_factor rf_; // resize factor
227
+
228
+ uint32_t curr_items_alloc_; // currently allocated array size
229
+ bool filled_data_; // true if we've explicitly set all entries in data_
230
+
231
+ T* data_; // stored sampled items
232
+ double* weights_; // weights for sampled items
233
+
234
+ // The next two fields are hidden from the user because they are part of the state of the
235
+ // unioning algorithm, NOT part of a varopt sketch, or even of a varopt "gadget" (our name for
236
+ // the potentially invalid sketch that is maintained by the unioning algorithm). It would make
237
+ // more sense logically for these fields to be declared in the unioning object (whose entire
238
+ // purpose is storing the state of the unioning algorithm) but for reasons of programming
239
+ // convenience we are currently declaring them here. However, that could change in the future.
240
+
241
+ // Following int is:
242
+ // 1. Zero (for a varopt sketch)
243
+ // 2. Count of marked items in H region, if part of a unioning algo's gadget
244
+ uint32_t num_marks_in_h_;
245
+
246
+ // The following array is absent in a varopt sketch, and notionally present in a gadget
247
+ // (although it really belongs in the unioning object). If the array were to be made explicit,
248
+ // some additional coding would need to be done to ensure that all of the necessary data motion
249
+ // occurs and is properly tracked.
250
+ bool* marks_;
251
+
252
+ // used during deserialization to avoid memory leaks upon errors
253
+ class items_deleter;
254
+ class weights_deleter;
255
+ class marks_deleter;
256
+
257
+ var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget);
258
+ var_opt_sketch(uint32_t k, uint32_t h, uint32_t m, uint32_t r, uint64_t n, double total_wt_r, resize_factor rf,
259
+ uint32_t curr_items_alloc, bool filled_data, std::unique_ptr<T, items_deleter> items,
260
+ std::unique_ptr<double, weights_deleter> weights, uint32_t num_marks_in_h,
261
+ std::unique_ptr<bool, marks_deleter> marks);
262
+
263
+ friend class var_opt_union<T,S,A>;
264
+ var_opt_sketch(const var_opt_sketch& other, bool as_sketch, uint64_t adjusted_n);
265
+ var_opt_sketch(T* data, double* weights, size_t len, uint32_t k, uint64_t n, uint32_t h_count, uint32_t r_count, double total_wt_r);
266
+
267
+ string<A> items_to_string(bool print_gap) const;
268
+
269
+ // internal-use-only update
270
+ template<typename O>
271
+ inline void update(O&& item, double weight, bool mark);
272
+
273
+ template<typename O>
274
+ inline void update_warmup_phase(O&& item, double weight, bool mark);
275
+
276
+ template<typename O>
277
+ inline void update_light(O&& item, double weight, bool mark);
278
+
279
+ template<typename O>
280
+ inline void update_heavy_r_eq1(O&& item, double weight, bool mark);
281
+
282
+ template<typename O>
283
+ inline void update_heavy_general(O&& item, double weight, bool mark);
284
+
285
+ inline double get_tau() const;
286
+ inline double peek_min() const;
287
+ inline bool is_marked(uint32_t idx) const;
288
+
289
+ inline uint32_t pick_random_slot_in_r() const;
290
+ inline uint32_t choose_delete_slot(double wt_cand, uint32_t num_cand) const;
291
+ inline uint32_t choose_weighted_delete_slot(double wt_cand, uint32_t num_cand) const;
292
+
293
+ template<typename O>
294
+ inline void push(O&& item, double wt, bool mark);
295
+ inline void transition_from_warmup();
296
+ inline void convert_to_heap();
297
+ inline void restore_towards_leaves(uint32_t slot_in);
298
+ inline void restore_towards_root(uint32_t slot_in);
299
+ inline void pop_min_to_m_region();
300
+ void grow_candidate_set(double wt_cands, uint32_t num_cands);
301
+ void decrease_k_by_1();
302
+ void strip_marks();
303
+ void force_set_k(uint32_t k); // used to resolve union gadget into sketch
304
+ void downsample_candidate_set(double wt_cands, uint32_t num_cands);
305
+ inline void swap_values(uint32_t src, uint32_t dst);
306
+ void grow_data_arrays();
307
+ void allocate_data_arrays(uint32_t tgt_size, bool use_marks);
308
+
309
+ // validation
310
+ static void check_preamble_longs(uint8_t preamble_longs, uint8_t flags);
311
+ static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
312
+ static uint32_t validate_and_get_target_size(uint32_t preamble_longs, uint32_t k, uint64_t n,
313
+ uint32_t h, uint32_t r, resize_factor rf);
314
+
315
+ // things to move to common and be shared among sketches
316
+ static uint32_t get_adjusted_size(uint32_t max_size, uint32_t resize_target);
317
+ static uint32_t starting_sub_multiple(uint32_t lg_target, uint32_t lg_rf, uint32_t lg_min);
318
+ static inline double pseudo_hypergeometric_ub_on_p(uint64_t n, uint32_t k, double sampling_rate);
319
+ static inline double pseudo_hypergeometric_lb_on_p(uint64_t n, uint32_t k, double sampling_rate);
320
+ static bool is_power_of_2(uint32_t v);
321
+ static uint32_t to_log_2(uint32_t v);
322
+ static inline uint32_t next_int(uint32_t max_value);
323
+ static inline double next_double_exclude_zero();
324
+
325
+ class iterator;
326
+ };
327
+
328
+ template<typename T, typename S, typename A>
329
+ class var_opt_sketch<T, S, A>::const_iterator : public std::iterator<std::input_iterator_tag, T> {
330
+ public:
331
+ const_iterator(const const_iterator& other);
332
+ const_iterator& operator++();
333
+ const_iterator& operator++(int);
334
+ bool operator==(const const_iterator& other) const;
335
+ bool operator!=(const const_iterator& other) const;
336
+ const std::pair<const T&, const double> operator*() const;
337
+
338
+ private:
339
+ friend class var_opt_sketch<T,S,A>;
340
+ friend class var_opt_union<T,S,A>;
341
+
342
+ // default iterator over full sketch
343
+ const_iterator(const var_opt_sketch<T,S,A>& sk, bool is_end);
344
+
345
+ // iterates over only one of the H or R region, optionally applying weight correction
346
+ // to R region (can correct for numerical precision issues)
347
+ const_iterator(const var_opt_sketch<T,S,A>& sk, bool is_end, bool use_r_region);
348
+
349
+ bool get_mark() const;
350
+
351
+ const var_opt_sketch<T,S,A>* sk_;
352
+ double cum_r_weight_; // used for weight correction
353
+ double r_item_wt_;
354
+ size_t idx_;
355
+ const size_t final_idx_;
356
+ bool weight_correction_;
357
+ };
358
+
359
+ // non-const iterator for internal use
360
+ template<typename T, typename S, typename A>
361
+ class var_opt_sketch<T, S, A>::iterator : public std::iterator<std::input_iterator_tag, T> {
362
+ public:
363
+ iterator(const iterator& other);
364
+ iterator& operator++();
365
+ iterator& operator++(int);
366
+ bool operator==(const iterator& other) const;
367
+ bool operator!=(const iterator& other) const;
368
+ std::pair<T&, double> operator*();
369
+
370
+ private:
371
+ friend class var_opt_sketch<T,S,A>;
372
+ friend class var_opt_union<T,S,A>;
373
+
374
+ // iterates over only one of the H or R region, applying weight correction
375
+ // if iterating over R region (can correct for numerical precision issues)
376
+ iterator(const var_opt_sketch<T,S,A>& sk, bool is_end, bool use_r_region);
377
+
378
+ bool get_mark() const;
379
+
380
+ const var_opt_sketch<T,S,A>* sk_;
381
+ double cum_r_weight_; // used for weight correction
382
+ double r_item_wt_;
383
+ size_t idx_;
384
+ const size_t final_idx_;
385
+ };
386
+
387
+
388
+ } // namespace datasketches
389
+
390
+ #include "var_opt_sketch_impl.hpp"
391
+
392
+ #endif // _VAR_OPT_SKETCH_HPP_
@@ -0,0 +1,1752 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _VAR_OPT_SKETCH_IMPL_HPP_
21
+ #define _VAR_OPT_SKETCH_IMPL_HPP_
22
+
23
+ #include <memory>
24
+ #include <sstream>
25
+ #include <cmath>
26
+ #include <random>
27
+ #include <algorithm>
28
+
29
+ #include "var_opt_sketch.hpp"
30
+ #include "serde.hpp"
31
+ #include "bounds_binomial_proportions.hpp"
32
+ #include "count_zeros.hpp"
33
+ #include "memory_operations.hpp"
34
+ #include "ceiling_power_of_2.hpp"
35
+
36
+ namespace datasketches {
37
+
38
+ /**
39
+ * Implementation code for the VarOpt sketch.
40
+ *
41
+ * author Kevin Lang
42
+ * author Jon Malkin
43
+ */
44
+ template<typename T, typename S, typename A>
45
+ var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf) :
46
+ var_opt_sketch<T,S,A>(k, rf, false) {}
47
+
48
+ template<typename T, typename S, typename A>
49
+ var_opt_sketch<T,S,A>::var_opt_sketch(const var_opt_sketch& other) :
50
+ k_(other.k_),
51
+ h_(other.h_),
52
+ m_(other.m_),
53
+ r_(other.r_),
54
+ n_(other.n_),
55
+ total_wt_r_(other.total_wt_r_),
56
+ rf_(other.rf_),
57
+ curr_items_alloc_(other.curr_items_alloc_),
58
+ filled_data_(other.filled_data_),
59
+ data_(nullptr),
60
+ weights_(nullptr),
61
+ num_marks_in_h_(other.num_marks_in_h_),
62
+ marks_(nullptr)
63
+ {
64
+ data_ = A().allocate(curr_items_alloc_);
65
+ // skip gap or anything unused at the end
66
+ for (size_t i = 0; i < h_; ++i)
67
+ new (&data_[i]) T(other.data_[i]);
68
+ for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i)
69
+ new (&data_[i]) T(other.data_[i]);
70
+
71
+ // we skipped the gap
72
+ filled_data_ = false;
73
+
74
+ weights_ = AllocDouble().allocate(curr_items_alloc_);
75
+ // doubles so can successfully copy regardless of the internal state
76
+ std::copy(&other.weights_[0], &other.weights_[curr_items_alloc_], weights_);
77
+
78
+ if (other.marks_ != nullptr) {
79
+ marks_ = AllocBool().allocate(curr_items_alloc_);
80
+ std::copy(&other.marks_[0], &other.marks_[curr_items_alloc_], marks_);
81
+ }
82
+ }
83
+
84
+ template<typename T, typename S, typename A>
85
+ var_opt_sketch<T,S,A>::var_opt_sketch(const var_opt_sketch& other, bool as_sketch, uint64_t adjusted_n) :
86
+ k_(other.k_),
87
+ h_(other.h_),
88
+ m_(other.m_),
89
+ r_(other.r_),
90
+ n_(adjusted_n),
91
+ total_wt_r_(other.total_wt_r_),
92
+ rf_(other.rf_),
93
+ curr_items_alloc_(other.curr_items_alloc_),
94
+ filled_data_(other.filled_data_),
95
+ data_(nullptr),
96
+ weights_(nullptr),
97
+ num_marks_in_h_(other.num_marks_in_h_),
98
+ marks_(nullptr)
99
+ {
100
+ data_ = A().allocate(curr_items_alloc_);
101
+ // skip gap or anything unused at the end
102
+ for (size_t i = 0; i < h_; ++i)
103
+ new (&data_[i]) T(other.data_[i]);
104
+ for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i)
105
+ new (&data_[i]) T(other.data_[i]);
106
+
107
+ // we skipped the gap
108
+ filled_data_ = false;
109
+
110
+ weights_ = AllocDouble().allocate(curr_items_alloc_);
111
+ // doubles so can successfully copy regardless of the internal state
112
+ std::copy(&other.weights_[0], &other.weights_[curr_items_alloc_], weights_);
113
+
114
+ if (!as_sketch && other.marks_ != nullptr) {
115
+ marks_ = AllocBool().allocate(curr_items_alloc_);
116
+ std::copy(&other.marks_[0], &other.marks_[curr_items_alloc_], marks_);
117
+ }
118
+ }
119
+
120
+ template<typename T, typename S, typename A>
121
+ var_opt_sketch<T,S,A>::var_opt_sketch(T* data, double* weights, size_t len,
122
+ uint32_t k, uint64_t n, uint32_t h_count, uint32_t r_count, double total_wt_r) :
123
+ k_(k),
124
+ h_(h_count),
125
+ m_(0),
126
+ r_(r_count),
127
+ n_(n),
128
+ total_wt_r_(total_wt_r),
129
+ rf_(DEFAULT_RESIZE_FACTOR),
130
+ curr_items_alloc_(len),
131
+ filled_data_(n > k),
132
+ data_(data),
133
+ weights_(weights),
134
+ num_marks_in_h_(0),
135
+ marks_(nullptr)
136
+ {}
137
+
138
+ template<typename T, typename S, typename A>
139
+ var_opt_sketch<T,S,A>::var_opt_sketch(var_opt_sketch&& other) noexcept :
140
+ k_(other.k_),
141
+ h_(other.h_),
142
+ m_(other.m_),
143
+ r_(other.r_),
144
+ n_(other.n_),
145
+ total_wt_r_(other.total_wt_r_),
146
+ rf_(other.rf_),
147
+ curr_items_alloc_(other.curr_items_alloc_),
148
+ filled_data_(other.filled_data_),
149
+ data_(other.data_),
150
+ weights_(other.weights_),
151
+ num_marks_in_h_(other.num_marks_in_h_),
152
+ marks_(other.marks_)
153
+ {
154
+ other.data_ = nullptr;
155
+ other.weights_ = nullptr;
156
+ other.marks_ = nullptr;
157
+ }
158
+
159
+ template<typename T, typename S, typename A>
160
+ var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget) :
161
+ k_(k), h_(0), m_(0), r_(0), n_(0), total_wt_r_(0.0), rf_(rf) {
162
+ if (k == 0 || k_ > MAX_K) {
163
+ throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
164
+ }
165
+
166
+ uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
167
+ uint32_t initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf_, MIN_LG_ARR_ITEMS);
168
+ curr_items_alloc_ = get_adjusted_size(k_, 1 << initial_lg_size);
169
+ if (curr_items_alloc_ == k_) { // if full size, need to leave 1 for the gap
170
+ ++curr_items_alloc_;
171
+ }
172
+
173
+ allocate_data_arrays(curr_items_alloc_, is_gadget);
174
+ num_marks_in_h_ = 0;
175
+ }
176
+
177
+ template<typename T, typename S, typename A>
178
+ var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, uint32_t h, uint32_t m, uint32_t r, uint64_t n, double total_wt_r, resize_factor rf,
179
+ uint32_t curr_items_alloc, bool filled_data, std::unique_ptr<T, items_deleter> items,
180
+ std::unique_ptr<double, weights_deleter> weights, uint32_t num_marks_in_h,
181
+ std::unique_ptr<bool, marks_deleter> marks) :
182
+ k_(k),
183
+ h_(h),
184
+ m_(m),
185
+ r_(r),
186
+ n_(n),
187
+ total_wt_r_(total_wt_r),
188
+ rf_(rf),
189
+ curr_items_alloc_(curr_items_alloc),
190
+ filled_data_(filled_data),
191
+ data_(items.release()),
192
+ weights_(weights.release()),
193
+ num_marks_in_h_(num_marks_in_h),
194
+ marks_(marks.release())
195
+ {}
196
+
197
+
198
+ template<typename T, typename S, typename A>
199
+ var_opt_sketch<T,S,A>::~var_opt_sketch() {
200
+ if (data_ != nullptr) {
201
+ if (filled_data_) {
202
+ // destroy everything
203
+ const size_t num_to_destroy = std::min(k_ + 1, curr_items_alloc_);
204
+ for (size_t i = 0; i < num_to_destroy; ++i) {
205
+ A().destroy(data_ + i);
206
+ }
207
+ } else {
208
+ // skip gap or anything unused at the end
209
+ for (size_t i = 0; i < h_; ++i) {
210
+ A().destroy(data_+ i);
211
+ }
212
+
213
+ for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i) {
214
+ A().destroy(data_ + i);
215
+ }
216
+ }
217
+ A().deallocate(data_, curr_items_alloc_);
218
+ }
219
+
220
+ if (weights_ != nullptr) {
221
+ AllocDouble().deallocate(weights_, curr_items_alloc_);
222
+ }
223
+
224
+ if (marks_ != nullptr) {
225
+ AllocBool().deallocate(marks_, curr_items_alloc_);
226
+ }
227
+ }
228
+
229
+ template<typename T, typename S, typename A>
230
+ var_opt_sketch<T,S,A>& var_opt_sketch<T,S,A>::operator=(const var_opt_sketch& other) {
231
+ var_opt_sketch<T,S,A> sk_copy(other);
232
+ std::swap(k_, sk_copy.k_);
233
+ std::swap(h_, sk_copy.h_);
234
+ std::swap(m_, sk_copy.m_);
235
+ std::swap(r_, sk_copy.r_);
236
+ std::swap(n_, sk_copy.n_);
237
+ std::swap(total_wt_r_, sk_copy.total_wt_r_);
238
+ std::swap(rf_, sk_copy.rf_);
239
+ std::swap(curr_items_alloc_, sk_copy.curr_items_alloc_);
240
+ std::swap(filled_data_, sk_copy.filled_data_);
241
+ std::swap(data_, sk_copy.data_);
242
+ std::swap(weights_, sk_copy.weights_);
243
+ std::swap(num_marks_in_h_, sk_copy.num_marks_in_h_);
244
+ std::swap(marks_, sk_copy.marks_);
245
+ return *this;
246
+ }
247
+
248
+ template<typename T, typename S, typename A>
249
+ var_opt_sketch<T,S,A>& var_opt_sketch<T,S,A>::operator=(var_opt_sketch&& other) {
250
+ std::swap(k_, other.k_);
251
+ std::swap(h_, other.h_);
252
+ std::swap(m_, other.m_);
253
+ std::swap(r_, other.r_);
254
+ std::swap(n_, other.n_);
255
+ std::swap(total_wt_r_, other.total_wt_r_);
256
+ std::swap(rf_, other.rf_);
257
+ std::swap(curr_items_alloc_, other.curr_items_alloc_);
258
+ std::swap(filled_data_, other.filled_data_);
259
+ std::swap(data_, other.data_);
260
+ std::swap(weights_, other.weights_);
261
+ std::swap(num_marks_in_h_, other.num_marks_in_h_);
262
+ std::swap(marks_, other.marks_);
263
+ return *this;
264
+ }
265
+
266
+ /*
267
+ * An empty sketch requires 8 bytes.
268
+ *
269
+ * <pre>
270
+ * Long || Start Byte Adr:
271
+ * Adr:
272
+ * || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
273
+ * 0 || Preamble_Longs | SerVer | FamID | Flags |---------Max Res. Size (K)---------|
274
+ * </pre>
275
+ *
276
+ * A non-empty sketch requires 24 bytes of preamble for an under-full sample; once there are
277
+ * at least k items the sketch uses 32 bytes of preamble.
278
+ *
279
+ * The count of items seen is limited to 48 bits (~256 trillion) even though there are adjacent
280
+ * unused preamble bits. The acceptance probability for an item is a double in the range [0,1),
281
+ * limiting us to 53 bits of randomness due to details of the IEEE floating point format. To
282
+ * ensure meaningful probabilities as the items seen count approaches capacity, we intentionally
283
+ * use slightly fewer bits.
284
+ *
285
+ * Following the header are weights for the heavy items, then marks in the event this is a gadget.
286
+ * The serialized items come last.
287
+ *
288
+ * <pre>
289
+ * Long || Start Byte Adr:
290
+ * Adr:
291
+ * || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
292
+ * 0 || Preamble_Longs | SerVer | FamID | Flags |---------Max Res. Size (K)---------|
293
+ *
294
+ * || 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
295
+ * 1 ||---------------------------Items Seen Count (N)--------------------------------|
296
+ *
297
+ * || 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
298
+ * 2 ||-------------Item Count in H---------------|-------Item Count in R-------------|
299
+ *
300
+ * || 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
301
+ * 3 ||-------------------------------Total Weight in R-------------------------------|
302
+ * </pre>
303
+ */
304
+
305
+ // implementation for fixed-size arithmetic types (integral and floating point)
306
+ template<typename T, typename S, typename A>
307
+ template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
308
+ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
309
+ if (is_empty()) { return PREAMBLE_LONGS_EMPTY << 3; }
310
+ size_t num_bytes = (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL) << 3;
311
+ num_bytes += h_ * sizeof(double); // weights
312
+ if (marks_ != nullptr) { // marks
313
+ num_bytes += (h_ / 8) + (h_ % 8 > 0);
314
+ }
315
+ num_bytes += (h_ + r_) * sizeof(TT); // the actual items
316
+ return num_bytes;
317
+ }
318
+
319
+ // implementation for all other types
320
+ template<typename T, typename S, typename A>
321
+ template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
322
+ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
323
+ if (is_empty()) { return PREAMBLE_LONGS_EMPTY << 3; }
324
+ size_t num_bytes = (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL) << 3;
325
+ num_bytes += h_ * sizeof(double); // weights
326
+ if (marks_ != nullptr) { // marks
327
+ num_bytes += (h_ / 8) + (h_ % 8 > 0);
328
+ }
329
+ // must iterate over the items
330
+ for (auto& it: *this)
331
+ num_bytes += S().size_of_item(it.first);
332
+ return num_bytes;
333
+ }
334
+
335
+ template<typename T, typename S, typename A>
336
+ std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned header_size_bytes) const {
337
+ const size_t size = header_size_bytes + get_serialized_size_bytes();
338
+ std::vector<uint8_t, AllocU8<A>> bytes(size);
339
+ uint8_t* ptr = bytes.data() + header_size_bytes;
340
+ uint8_t* end_ptr = ptr + size;
341
+
342
+ bool empty = is_empty();
343
+ uint8_t preLongs = (empty ? PREAMBLE_LONGS_EMPTY
344
+ : (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL));
345
+ uint8_t first_byte = (preLongs & 0x3F) | ((static_cast<uint8_t>(rf_)) << 6);
346
+ uint8_t flags = (marks_ != nullptr ? GADGET_FLAG_MASK : 0);
347
+
348
+ if (empty) {
349
+ flags |= EMPTY_FLAG_MASK;
350
+ }
351
+
352
+ // first prelong
353
+ uint8_t ser_ver(SER_VER);
354
+ uint8_t family(FAMILY_ID);
355
+ ptr += copy_to_mem(&first_byte, ptr, sizeof(uint8_t));
356
+ ptr += copy_to_mem(&ser_ver, ptr, sizeof(uint8_t));
357
+ ptr += copy_to_mem(&family, ptr, sizeof(uint8_t));
358
+ ptr += copy_to_mem(&flags, ptr, sizeof(uint8_t));
359
+ ptr += copy_to_mem(&k_, ptr, sizeof(uint32_t));
360
+
361
+ if (!empty) {
362
+ // second and third prelongs
363
+ ptr += copy_to_mem(&n_, ptr, sizeof(uint64_t));
364
+ ptr += copy_to_mem(&h_, ptr, sizeof(uint32_t));
365
+ ptr += copy_to_mem(&r_, ptr, sizeof(uint32_t));
366
+
367
+ // fourth prelong, if needed
368
+ if (r_ > 0) {
369
+ ptr += copy_to_mem(&total_wt_r_, ptr, sizeof(double));
370
+ }
371
+
372
+ // first h_ weights
373
+ ptr += copy_to_mem(weights_, ptr, h_ * sizeof(double));
374
+
375
+ // first h_ marks as packed bytes iff we have a gadget
376
+ if (marks_ != nullptr) {
377
+ uint8_t val = 0;
378
+ for (uint32_t i = 0; i < h_; ++i) {
379
+ if (marks_[i]) {
380
+ val |= 0x1 << (i & 0x7);
381
+ }
382
+
383
+ if ((i & 0x7) == 0x7) {
384
+ ptr += copy_to_mem(&val, ptr, sizeof(uint8_t));
385
+ val = 0;
386
+ }
387
+ }
388
+
389
+ // write out any remaining values
390
+ if ((h_ & 0x7) > 0) {
391
+ ptr += copy_to_mem(&val, ptr, sizeof(uint8_t));
392
+ }
393
+ }
394
+
395
+ // write the sample items, skipping the gap. Either h_ or r_ may be 0
396
+ ptr += S().serialize(ptr, end_ptr - ptr, data_, h_);
397
+ ptr += S().serialize(ptr, end_ptr - ptr, &data_[h_ + 1], r_);
398
+ }
399
+
400
+ size_t bytes_written = ptr - bytes.data();
401
+ if (bytes_written != size) {
402
+ throw std::logic_error("serialized size mismatch: " + std::to_string(bytes_written) + " != " + std::to_string(size));
403
+ }
404
+
405
+ return bytes;
406
+ }
407
+
408
+ template<typename T, typename S, typename A>
409
+ void var_opt_sketch<T,S,A>::serialize(std::ostream& os) const {
410
+ const bool empty = (h_ == 0) && (r_ == 0);
411
+
412
+ const uint8_t preLongs = (empty ? PREAMBLE_LONGS_EMPTY
413
+ : (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL));
414
+ const uint8_t first_byte = (preLongs & 0x3F) | ((static_cast<uint8_t>(rf_)) << 6);
415
+ uint8_t flags = (marks_ != nullptr ? GADGET_FLAG_MASK : 0);
416
+
417
+ if (empty) {
418
+ flags |= EMPTY_FLAG_MASK;
419
+ }
420
+
421
+ // first prelong
422
+ const uint8_t ser_ver(SER_VER);
423
+ const uint8_t family(FAMILY_ID);
424
+ os.write((char*)&first_byte, sizeof(uint8_t));
425
+ os.write((char*)&ser_ver, sizeof(uint8_t));
426
+ os.write((char*)&family, sizeof(uint8_t));
427
+ os.write((char*)&flags, sizeof(uint8_t));
428
+ os.write((char*)&k_, sizeof(uint32_t));
429
+
430
+ if (!empty) {
431
+ // second and third prelongs
432
+ os.write((char*)&n_, sizeof(uint64_t));
433
+ os.write((char*)&h_, sizeof(uint32_t));
434
+ os.write((char*)&r_, sizeof(uint32_t));
435
+
436
+ // fourth prelong, if needed
437
+ if (r_ > 0) {
438
+ os.write((char*)&total_wt_r_, sizeof(double));
439
+ }
440
+
441
+ // write the first h_ weights
442
+ os.write((char*)weights_, h_ * sizeof(double));
443
+
444
+ // write the first h_ marks as packed bytes iff we have a gadget
445
+ if (marks_ != nullptr) {
446
+ uint8_t val = 0;
447
+ for (uint32_t i = 0; i < h_; ++i) {
448
+ if (marks_[i]) {
449
+ val |= 0x1 << (i & 0x7);
450
+ }
451
+
452
+ if ((i & 0x7) == 0x7) {
453
+ os.write((char*)&val, sizeof(uint8_t));
454
+ val = 0;
455
+ }
456
+ }
457
+
458
+ // write out any remaining values
459
+ if ((h_ & 0x7) > 0) {
460
+ os.write((char*)&val, sizeof(uint8_t));
461
+ }
462
+ }
463
+
464
+ // write the sample items, skipping the gap. Either h_ or r_ may be 0
465
+ S().serialize(os, data_, h_);
466
+ S().serialize(os, &data_[h_ + 1], r_);
467
+ }
468
+ }
469
+
470
+ template<typename T, typename S, typename A>
471
+ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size_t size) {
472
+ ensure_minimum_memory(size, 8);
473
+ const char* ptr = static_cast<const char*>(bytes);
474
+ const char* base = ptr;
475
+ const char* end_ptr = ptr + size;
476
+ uint8_t first_byte;
477
+ ptr += copy_from_mem(ptr, &first_byte, sizeof(first_byte));
478
+ uint8_t preamble_longs = first_byte & 0x3f;
479
+ resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
480
+ uint8_t serial_version;
481
+ ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
482
+ uint8_t family_id;
483
+ ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
484
+ uint8_t flags;
485
+ ptr += copy_from_mem(ptr, &flags, sizeof(flags));
486
+ uint32_t k;
487
+ ptr += copy_from_mem(ptr, &k, sizeof(k));
488
+
489
+ check_preamble_longs(preamble_longs, flags);
490
+ check_family_and_serialization_version(family_id, serial_version);
491
+ ensure_minimum_memory(size, preamble_longs << 3);
492
+
493
+ const bool is_empty = flags & EMPTY_FLAG_MASK;
494
+ const bool is_gadget = flags & GADGET_FLAG_MASK;
495
+
496
+ if (is_empty) {
497
+ return var_opt_sketch<T,S,A>(k, rf, is_gadget);
498
+ }
499
+
500
+ // second and third prelongs
501
+ uint64_t n;
502
+ uint32_t h, r;
503
+ ptr += copy_from_mem(ptr, &n, sizeof(n));
504
+ ptr += copy_from_mem(ptr, &h, sizeof(h));
505
+ ptr += copy_from_mem(ptr, &r, sizeof(r));
506
+
507
+ const uint32_t array_size = validate_and_get_target_size(preamble_longs, k, n, h, r, rf);
508
+
509
+ // current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating
510
+ double total_wt_r = 0.0;
511
+ if (preamble_longs == PREAMBLE_LONGS_FULL) {
512
+ ptr += copy_from_mem(ptr, &total_wt_r, sizeof(total_wt_r));
513
+ if (std::isnan(total_wt_r) || r == 0 || total_wt_r <= 0.0) {
514
+ throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
515
+ "Found r = " + std::to_string(r) + ", R region weight = " + std::to_string(total_wt_r));
516
+ }
517
+ } else {
518
+ total_wt_r = 0.0;
519
+ }
520
+
521
+ // read the first h_ weights, fill in rest of array with -1.0
522
+ check_memory_size(ptr - base + (h * sizeof(double)), size);
523
+ std::unique_ptr<double, weights_deleter> weights(AllocDouble().allocate(array_size), weights_deleter(array_size));
524
+ double* wts = weights.get(); // to avoid lots of .get() calls -- do not delete
525
+ ptr += copy_from_mem(ptr, wts, h * sizeof(double));
526
+ for (size_t i = 0; i < h; ++i) {
527
+ if (!(wts[i] > 0.0)) {
528
+ throw std::invalid_argument("Possible corruption: Non-positive weight when deserializing: " + std::to_string(wts[i]));
529
+ }
530
+ }
531
+ std::fill(&wts[h], &wts[array_size], -1.0);
532
+
533
+ // read the first h_ marks as packed bytes iff we have a gadget
534
+ uint32_t num_marks_in_h = 0;
535
+ std::unique_ptr<bool, marks_deleter> marks(nullptr, marks_deleter(array_size));
536
+ if (is_gadget) {
537
+ uint8_t val = 0;
538
+ marks = std::unique_ptr<bool, marks_deleter>(AllocBool().allocate(array_size), marks_deleter(array_size));
539
+ const size_t size_marks = (h / 8) + (h % 8 > 0 ? 1 : 0);
540
+ check_memory_size(ptr - base + size_marks, size);
541
+ for (uint32_t i = 0; i < h; ++i) {
542
+ if ((i & 0x7) == 0x0) { // should trigger on first iteration
543
+ ptr += copy_from_mem(ptr, &val, sizeof(val));
544
+ }
545
+ marks.get()[i] = ((val >> (i & 0x7)) & 0x1) == 1;
546
+ num_marks_in_h += (marks.get()[i] ? 1 : 0);
547
+ }
548
+ }
549
+
550
+ // read the sample items, skipping the gap. Either h_ or r_ may be 0
551
+ items_deleter deleter(array_size);
552
+ std::unique_ptr<T, items_deleter> items(A().allocate(array_size), deleter);
553
+
554
+ ptr += S().deserialize(ptr, end_ptr - ptr, items.get(), h);
555
+ items.get_deleter().set_h(h); // serde didn't throw, so the items are now valid
556
+
557
+ ptr += S().deserialize(ptr, end_ptr - ptr, &(items.get()[h + 1]), r);
558
+ items.get_deleter().set_r(r); // serde didn't throw, so the items are now valid
559
+
560
+ return var_opt_sketch(k, h, (r > 0 ? 1 : 0), r, n, total_wt_r, rf, array_size, false,
561
+ std::move(items), std::move(weights), num_marks_in_h, std::move(marks));
562
+ }
563
+
564
+ template<typename T, typename S, typename A>
565
+ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is) {
566
+ uint8_t first_byte;
567
+ is.read((char*)&first_byte, sizeof(first_byte));
568
+ uint8_t preamble_longs = first_byte & 0x3f;
569
+ resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
570
+ uint8_t serial_version;
571
+ is.read((char*)&serial_version, sizeof(serial_version));
572
+ uint8_t family_id;
573
+ is.read((char*)&family_id, sizeof(family_id));
574
+ uint8_t flags;
575
+ is.read((char*)&flags, sizeof(flags));
576
+ uint32_t k;
577
+ is.read((char*)&k, sizeof(k));
578
+
579
+ check_preamble_longs(preamble_longs, flags);
580
+ check_family_and_serialization_version(family_id, serial_version);
581
+
582
+ const bool is_empty = flags & EMPTY_FLAG_MASK;
583
+ const bool is_gadget = flags & GADGET_FLAG_MASK;
584
+
585
+ if (is_empty) {
586
+ if (!is.good())
587
+ throw std::runtime_error("error reading from std::istream");
588
+ else
589
+ return var_opt_sketch<T,S,A>(k, rf, is_gadget);
590
+ }
591
+
592
+ // second and third prelongs
593
+ uint64_t n;
594
+ uint32_t h, r;
595
+ is.read((char*)&n, sizeof(n));
596
+ is.read((char*)&h, sizeof(h));
597
+ is.read((char*)&r, sizeof(r));
598
+
599
+ const uint32_t array_size = validate_and_get_target_size(preamble_longs, k, n, h, r, rf);
600
+
601
+ // current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating
602
+ double total_wt_r = 0.0;
603
+ if (preamble_longs == PREAMBLE_LONGS_FULL) {
604
+ is.read((char*)&total_wt_r, sizeof(total_wt_r));
605
+ if (std::isnan(total_wt_r) || r == 0 || total_wt_r <= 0.0) {
606
+ throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
607
+ "Found r = " + std::to_string(r) + ", R region weight = " + std::to_string(total_wt_r));
608
+ }
609
+ } else {
610
+ total_wt_r = 0.0;
611
+ }
612
+
613
+ // read the first h weights, fill remainder with -1.0
614
+ std::unique_ptr<double, weights_deleter> weights(AllocDouble().allocate(array_size), weights_deleter(array_size));
615
+ double* wts = weights.get(); // to avoid lots of .get() calls -- do not delete
616
+ is.read((char*)wts, h * sizeof(double));
617
+ for (size_t i = 0; i < h; ++i) {
618
+ if (!(wts[i] > 0.0)) {
619
+ throw std::invalid_argument("Possible corruption: Non-positive weight when deserializing: " + std::to_string(wts[i]));
620
+ }
621
+ }
622
+ std::fill(&wts[h], &wts[array_size], -1.0);
623
+
624
+ // read the first h_ marks as packed bytes iff we have a gadget
625
+ uint32_t num_marks_in_h = 0;
626
+ std::unique_ptr<bool, marks_deleter> marks(nullptr, marks_deleter(array_size));
627
+ if (is_gadget) {
628
+ marks = std::unique_ptr<bool, marks_deleter>(AllocBool().allocate(array_size), marks_deleter(array_size));
629
+ uint8_t val = 0;
630
+ for (uint32_t i = 0; i < h; ++i) {
631
+ if ((i & 0x7) == 0x0) { // should trigger on first iteration
632
+ is.read((char*)&val, sizeof(val));
633
+ }
634
+ marks.get()[i] = ((val >> (i & 0x7)) & 0x1) == 1;
635
+ num_marks_in_h += (marks.get()[i] ? 1 : 0);
636
+ }
637
+ }
638
+
639
+ // read the sample items, skipping the gap. Either h or r may be 0
640
+ items_deleter deleter(array_size);
641
+ std::unique_ptr<T, items_deleter> items(A().allocate(array_size), deleter);
642
+
643
+ S().deserialize(is, items.get(), h); // aka &data_[0]
644
+ items.get_deleter().set_h(h); // serde didn't throw, so the items are now valid
645
+
646
+ S().deserialize(is, &(items.get()[h + 1]), r);
647
+ items.get_deleter().set_r(r); // serde didn't throw, so the items are now valid
648
+
649
+ if (!is.good())
650
+ throw std::runtime_error("error reading from std::istream");
651
+
652
+ return var_opt_sketch(k, h, (r > 0 ? 1 : 0), r, n, total_wt_r, rf, array_size, false,
653
+ std::move(items), std::move(weights), num_marks_in_h, std::move(marks));
654
+ }
655
+
656
+ template<typename T, typename S, typename A>
657
+ bool var_opt_sketch<T,S,A>::is_empty() const {
658
+ return (h_ == 0 && r_ == 0);
659
+ }
660
+
661
+ template<typename T, typename S, typename A>
662
+ void var_opt_sketch<T,S,A>::reset() {
663
+ const uint32_t prev_alloc = curr_items_alloc_;
664
+ const uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
665
+ const uint32_t initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf_, MIN_LG_ARR_ITEMS);
666
+ curr_items_alloc_ = get_adjusted_size(k_, 1 << initial_lg_size);
667
+ if (curr_items_alloc_ == k_) { // if full size, need to leave 1 for the gap
668
+ ++curr_items_alloc_;
669
+ }
670
+
671
+ if (filled_data_) {
672
+ // destroy everything
673
+ const size_t num_to_destroy = std::min(k_ + 1, prev_alloc);
674
+ for (size_t i = 0; i < num_to_destroy; ++i)
675
+ A().destroy(data_ + i);
676
+ } else {
677
+ // skip gap or anything unused at the end
678
+ for (size_t i = 0; i < h_; ++i)
679
+ A().destroy(data_+ i);
680
+
681
+ for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i)
682
+ A().destroy(data_ + i);
683
+ }
684
+
685
+ if (curr_items_alloc_ < prev_alloc) {
686
+ const bool is_gadget = (marks_ != nullptr);
687
+
688
+ A().deallocate(data_, prev_alloc);
689
+ AllocDouble().deallocate(weights_, prev_alloc);
690
+
691
+ if (marks_ != nullptr)
692
+ AllocBool().deallocate(marks_, prev_alloc);
693
+
694
+ allocate_data_arrays(curr_items_alloc_, is_gadget);
695
+ }
696
+
697
+ n_ = 0;
698
+ h_ = 0;
699
+ m_ = 0;
700
+ r_ = 0;
701
+ num_marks_in_h_ = 0;
702
+ total_wt_r_ = 0.0;
703
+ filled_data_ = false;
704
+ }
705
+
706
+ template<typename T, typename S, typename A>
707
+ uint64_t var_opt_sketch<T,S,A>::get_n() const {
708
+ return n_;
709
+ }
710
+
711
+ template<typename T, typename S, typename A>
712
+ uint32_t var_opt_sketch<T,S,A>::get_k() const {
713
+ return k_;
714
+ }
715
+
716
+ template<typename T, typename S, typename A>
717
+ uint32_t var_opt_sketch<T,S,A>::get_num_samples() const {
718
+ const uint32_t num_in_sketch = h_ + r_;
719
+ return (num_in_sketch < k_ ? num_in_sketch : k_);
720
+ }
721
+
722
+ template<typename T, typename S, typename A>
723
+ void var_opt_sketch<T,S,A>::update(const T& item, double weight) {
724
+ update(item, weight, false);
725
+ }
726
+
727
+ template<typename T, typename S, typename A>
728
+ void var_opt_sketch<T,S,A>::update(T&& item, double weight) {
729
+ update(std::move(item), weight, false);
730
+ }
731
+
732
+ template<typename T, typename S, typename A>
733
+ string<A> var_opt_sketch<T,S,A>::to_string() const {
734
+ std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
735
+ os << "### VarOpt SUMMARY: " << std::endl;
736
+ os << " k : " << k_ << std::endl;
737
+ os << " h : " << h_ << std::endl;
738
+ os << " r : " << r_ << std::endl;
739
+ os << " weight_r : " << total_wt_r_ << std::endl;
740
+ os << " Current size : " << curr_items_alloc_ << std::endl;
741
+ os << " Resize factor: " << (1 << rf_) << std::endl;
742
+ os << "### END SKETCH SUMMARY" << std::endl;
743
+ return os.str();
744
+ }
745
+
746
+ template<typename T, typename S, typename A>
747
+ string<A> var_opt_sketch<T,S,A>::items_to_string() const {
748
+ std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
749
+ os << "### Sketch Items" << std::endl;
750
+ int idx = 0;
751
+ for (auto record : *this) {
752
+ os << idx << ": " << record.first << "\twt = " << record.second << std::endl;
753
+ ++idx;
754
+ }
755
+ return os.str();
756
+ }
757
+
758
+ template<typename T, typename S, typename A>
759
+ string<A> var_opt_sketch<T,S,A>::items_to_string(bool print_gap) const {
760
+ std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
761
+ os << "### Sketch Items" << std::endl;
762
+ const uint32_t array_length = (n_ < k_ ? n_ : k_ + 1);
763
+ for (uint32_t i = 0, display_idx = 0; i < array_length; ++i) {
764
+ if (i == h_ && print_gap) {
765
+ os << i << ": GAP" << std::endl;
766
+ ++display_idx;
767
+ } else {
768
+ os << i << ": " << data_[i] << "\twt = ";
769
+ if (weights_[i] == -1.0) {
770
+ os << get_tau() << "\t(-1.0)" << std::endl;
771
+ } else {
772
+ os << weights_[i] << std::endl;
773
+ }
774
+ ++display_idx;
775
+ }
776
+ }
777
+ return os.str();
778
+ }
779
+
780
+ template<typename T, typename S, typename A>
781
+ template<typename O>
782
+ void var_opt_sketch<T,S,A>::update(O&& item, double weight, bool mark) {
783
+ if (weight < 0.0 || std::isnan(weight) || std::isinf(weight)) {
784
+ throw std::invalid_argument("Item weights must be nonnegative and finite. Found: "
785
+ + std::to_string(weight));
786
+ } else if (weight == 0.0) {
787
+ return;
788
+ }
789
+ ++n_;
790
+
791
+ if (r_ == 0) {
792
+ // exact mode
793
+ update_warmup_phase(std::forward<O>(item), weight, mark);
794
+ } else {
795
+ // sketch is in estimation mode so we can make the following check,
796
+ // although very conservative to check every time
797
+ if ((h_ != 0) && (peek_min() < get_tau()))
798
+ throw std::logic_error("sketch not in valid estimation mode");
799
+
800
+ // what tau would be if deletion candidates turn out to be R plus the new item
801
+ // note: (r_ + 1) - 1 is intentional
802
+ const double hypothetical_tau = (weight + total_wt_r_) / ((r_ + 1) - 1);
803
+
804
+ // is new item's turn to be considered for reservoir?
805
+ const double condition1 = (h_ == 0) || (weight <= peek_min());
806
+
807
+ // is new item light enough for reservoir?
808
+ const double condition2 = weight < hypothetical_tau;
809
+
810
+ if (condition1 && condition2) {
811
+ update_light(std::forward<O>(item), weight, mark);
812
+ } else if (r_ == 1) {
813
+ update_heavy_r_eq1(std::forward<O>(item), weight, mark);
814
+ } else {
815
+ update_heavy_general(std::forward<O>(item), weight, mark);
816
+ }
817
+ }
818
+ }
819
+
820
+ template<typename T, typename S, typename A>
821
+ template<typename O>
822
+ void var_opt_sketch<T,S,A>::update_warmup_phase(O&& item, double weight, bool mark) {
823
+ // seems overly cautious
824
+ if (r_ > 0 || m_ != 0 || h_ > k_) throw std::logic_error("invalid sketch state during warmup");
825
+
826
+ if (h_ >= curr_items_alloc_) {
827
+ grow_data_arrays();
828
+ }
829
+
830
+ // store items as they come in until full
831
+ new (&data_[h_]) T(std::forward<O>(item));
832
+ weights_[h_] = weight;
833
+ if (marks_ != nullptr) {
834
+ marks_[h_] = mark;
835
+ }
836
+ ++h_;
837
+ num_marks_in_h_ += mark ? 1 : 0;
838
+
839
+ // check if need to heapify
840
+ if (h_ > k_) {
841
+ filled_data_ = true;
842
+ transition_from_warmup();
843
+ }
844
+ }
845
+
846
+ /* In the "light" case the new item has weight <= old_tau, so
847
+ would appear to the right of the R items in a hypothetical reverse-sorted
848
+ list. It is easy to prove that it is light enough to be part of this
849
+ round's downsampling */
850
+ template<typename T, typename S, typename A>
851
+ template<typename O>
852
+ void var_opt_sketch<T,S,A>::update_light(O&& item, double weight, bool mark) {
853
+ if (r_ == 0 || (r_ + h_) != k_) throw std::logic_error("invalid sketch state during light warmup");
854
+
855
+ const uint32_t m_slot = h_; // index of the gap, which becomes the M region
856
+ if (filled_data_) {
857
+ data_[m_slot] = std::forward<O>(item);
858
+ } else {
859
+ new (&data_[m_slot]) T(std::forward<O>(item));
860
+ filled_data_ = true;
861
+ }
862
+ weights_[m_slot] = weight;
863
+ if (marks_ != nullptr) { marks_[m_slot] = mark; }
864
+ ++m_;
865
+
866
+ grow_candidate_set(total_wt_r_ + weight, r_ + 1);
867
+ }
868
+
869
+ /* In the "heavy" case the new item has weight > old_tau, so would
870
+ appear to the left of items in R in a hypothetical reverse-sorted list and
871
+ might or might not be light enough be part of this round's downsampling.
872
+ [After first splitting off the R=1 case] we greatly simplify the code by
873
+ putting the new item into the H heap whether it needs to be there or not.
874
+ In other words, it might go into the heap and then come right back out,
875
+ but that should be okay because pseudo_heavy items cannot predominate
876
+ in long streams unless (max wt) / (min wt) > o(exp(N)) */
877
+ template<typename T, typename S, typename A>
878
+ template<typename O>
879
+ void var_opt_sketch<T,S,A>::update_heavy_general(O&& item, double weight, bool mark) {
880
+ if (r_ < 2 || m_ != 0 || (r_ + h_) != k_) throw std::logic_error("invalid sketch state during heavy general update");
881
+
882
+ // put into H, although may come back out momentarily
883
+ push(std::forward<O>(item), weight, mark);
884
+
885
+ grow_candidate_set(total_wt_r_, r_);
886
+ }
887
+
888
+ /* The analysis of this case is similar to that of the general heavy case.
889
+ The one small technical difference is that since R < 2, we must grab an M item
890
+ to have a valid starting point for continue_by_growing_candidate_set () */
891
+ template<typename T, typename S, typename A>
892
+ template<typename O>
893
+ void var_opt_sketch<T,S,A>::update_heavy_r_eq1(O&& item, double weight, bool mark) {
894
+ if (r_ != 1 || m_ != 0 || (r_ + h_) != k_) throw std::logic_error("invalid sketch state during heavy r=1 update");
895
+
896
+ push(std::forward<O>(item), weight, mark); // new item into H
897
+ pop_min_to_m_region(); // pop lightest back into M
898
+
899
+ // Any set of two items is downsample-able to one item,
900
+ // so the two lightest items are a valid starting point for the following
901
+ const uint32_t m_slot = k_ - 1; // array is k+1, 1 in R, so slot before is M
902
+ grow_candidate_set(weights_[m_slot] + total_wt_r_, 2);
903
+ }
904
+
905
+ /**
906
+ * Decreases sketch's value of k by 1, updating stored values as needed.
907
+ *
908
+ * <p>Subject to certain pre-conditions, decreasing k causes tau to increase. This fact is used by
909
+ * the unioning algorithm to force "marked" items out of H and into the reservoir region.</p>
910
+ */
911
+ template<typename T, typename S, typename A>
912
+ void var_opt_sketch<T,S,A>::decrease_k_by_1() {
913
+ if (k_ <= 1) {
914
+ throw std::logic_error("Cannot decrease k below 1 in union");
915
+ }
916
+
917
+ if ((h_ == 0) && (r_ == 0)) {
918
+ // exact mode, but no data yet; this reduction is somewhat gratuitous
919
+ --k_;
920
+ } else if ((h_ > 0) && (r_ == 0)) {
921
+ // exact mode, but we have some data
922
+ --k_;
923
+ if (h_ > k_) {
924
+ transition_from_warmup();
925
+ }
926
+ } else if ((h_ > 0) && (r_ > 0)) {
927
+ // reservoir mode, but we have some exact samples.
928
+ // Our strategy will be to pull an item out of H (which we are allowed to do since it's
929
+ // still just data), reduce k, and then re-insert the item
930
+
931
+ // first, slide the R zone to the left by 1, temporarily filling the gap
932
+ const uint32_t old_gap_idx = h_;
933
+ const uint32_t old_final_r_idx = (h_ + 1 + r_) - 1;
934
+ //if (old_final_r_idx != k_) throw std::logic_error("gadget in invalid state");
935
+
936
+ swap_values(old_final_r_idx, old_gap_idx);
937
+
938
+ // now we pull an item out of H; any item is ok, but if we grab the rightmost and then
939
+ // reduce h_, the heap invariant will be preserved (and the gap will be restored), plus
940
+ // the push() of the item that will probably happen later will be cheap.
941
+
942
+ const uint32_t pulled_idx = h_ - 1;
943
+ double pulled_weight = weights_[pulled_idx];
944
+ bool pulled_mark = marks_[pulled_idx];
945
+ // will move the pulled item below; don't do antying to it here
946
+
947
+ if (pulled_mark) { --num_marks_in_h_; }
948
+ weights_[pulled_idx] = -1.0; // to make bugs easier to spot
949
+
950
+ --h_;
951
+ --k_;
952
+ --n_; // will be re-incremented with the update
953
+
954
+ update(std::move(data_[pulled_idx]), pulled_weight, pulled_mark);
955
+ } else if ((h_ == 0) && (r_ > 0)) {
956
+ // pure reservoir mode, so can simply eject a randomly chosen sample from the reservoir
957
+ if (r_ < 2) throw std::logic_error("r_ too small for pure reservoir mode");
958
+
959
+ const uint32_t r_idx_to_delete = 1 + next_int(r_); // 1 for the gap
960
+ const uint32_t rightmost_r_idx = (1 + r_) - 1;
961
+ swap_values(r_idx_to_delete, rightmost_r_idx);
962
+ weights_[rightmost_r_idx] = -1.0;
963
+
964
+ --k_;
965
+ --r_;
966
+ }
967
+ }
968
+
969
+ template<typename T, typename S, typename A>
970
+ void var_opt_sketch<T,S,A>::allocate_data_arrays(uint32_t tgt_size, bool use_marks) {
971
+ filled_data_ = false;
972
+
973
+ data_ = A().allocate(tgt_size);
974
+ weights_ = AllocDouble().allocate(tgt_size);
975
+
976
+ if (use_marks) {
977
+ marks_ = AllocBool().allocate(tgt_size);
978
+ } else {
979
+ marks_ = nullptr;
980
+ }
981
+ }
982
+
983
+ template<typename T, typename S, typename A>
984
+ void var_opt_sketch<T,S,A>::grow_data_arrays() {
985
+ const uint32_t prev_size = curr_items_alloc_;
986
+ curr_items_alloc_ = get_adjusted_size(k_, curr_items_alloc_ << rf_);
987
+ if (curr_items_alloc_ == k_) {
988
+ ++curr_items_alloc_;
989
+ }
990
+
991
+ if (prev_size < curr_items_alloc_) {
992
+ filled_data_ = false;
993
+
994
+ T* tmp_data = A().allocate(curr_items_alloc_);
995
+ double* tmp_weights = AllocDouble().allocate(curr_items_alloc_);
996
+
997
+ for (uint32_t i = 0; i < prev_size; ++i) {
998
+ new (&tmp_data[i]) T(std::move(data_[i]));
999
+ A().destroy(data_ + i);
1000
+ tmp_weights[i] = weights_[i];
1001
+ }
1002
+
1003
+ A().deallocate(data_, prev_size);
1004
+ AllocDouble().deallocate(weights_, prev_size);
1005
+
1006
+ data_ = tmp_data;
1007
+ weights_ = tmp_weights;
1008
+
1009
+ if (marks_ != nullptr) {
1010
+ bool* tmp_marks = AllocBool().allocate(curr_items_alloc_);
1011
+ for (uint32_t i = 0; i < prev_size; ++i) {
1012
+ tmp_marks[i] = marks_[i];
1013
+ }
1014
+ AllocBool().deallocate(marks_, prev_size);
1015
+ marks_ = tmp_marks;
1016
+ }
1017
+ }
1018
+ }
1019
+
1020
+ template<typename T, typename S, typename A>
1021
+ void var_opt_sketch<T,S,A>::transition_from_warmup() {
1022
+ // Move the 2 lightest items from H to M
1023
+ // But the lighter really belongs in R, so update counts to reflect that
1024
+ convert_to_heap();
1025
+ pop_min_to_m_region();
1026
+ pop_min_to_m_region();
1027
+ --m_;
1028
+ ++r_;
1029
+
1030
+ if (h_ != (k_ -1) || m_ != 1 || r_ != 1)
1031
+ throw std::logic_error("invalid state for transitioning from warmup");
1032
+
1033
+ // Update total weight in R and then, having grabbed the value, overwrite
1034
+ // in weight_ array to help make bugs more obvious
1035
+ total_wt_r_ = weights_[k_]; // only one item, known location
1036
+ weights_[k_] = -1.0;
1037
+
1038
+ // The two lightest items are ncessarily downsample-able to one item,
1039
+ // and are therefore a valid initial candidate set
1040
+ grow_candidate_set(weights_[k_ - 1] + total_wt_r_, 2);
1041
+ }
1042
+
1043
+ template<typename T, typename S, typename A>
1044
+ void var_opt_sketch<T,S,A>::convert_to_heap() {
1045
+ if (h_ < 2) {
1046
+ return; // nothing to do
1047
+ }
1048
+
1049
+ const uint32_t last_slot = h_ - 1;
1050
+ const int last_non_leaf = ((last_slot + 1) / 2) - 1;
1051
+
1052
+ for (int j = last_non_leaf; j >= 0; --j) {
1053
+ restore_towards_leaves(j);
1054
+ }
1055
+
1056
+ // validates heap, used for initial debugging
1057
+ //for (uint32_t j = h_ - 1; j >= 1; --j) {
1058
+ // uint32_t p = ((j + 1) / 2) - 1;
1059
+ // if (weights_[p] > weights_[j]) throw std::logic_error("invalid heap");
1060
+ //}
1061
+ }
1062
+
1063
+ template<typename T, typename S, typename A>
1064
+ void var_opt_sketch<T,S,A>::restore_towards_leaves(uint32_t slot_in) {
1065
+ const uint32_t last_slot = h_ - 1;
1066
+ if (h_ == 0 || slot_in > last_slot) throw std::logic_error("invalid heap state");
1067
+
1068
+ uint32_t slot = slot_in;
1069
+ uint32_t child = (2 * slot_in) + 1; // might be invalid, need to check
1070
+
1071
+ while (child <= last_slot) {
1072
+ uint32_t child2 = child + 1; // might also be invalid
1073
+ if ((child2 <= last_slot) && (weights_[child2] < weights_[child])) {
1074
+ // siwtch to other child if it's both valid and smaller
1075
+ child = child2;
1076
+ }
1077
+
1078
+ if (weights_[slot] <= weights_[child]) {
1079
+ // invariant holds so we're done
1080
+ break;
1081
+ }
1082
+
1083
+ // swap and continue
1084
+ swap_values(slot, child);
1085
+
1086
+ slot = child;
1087
+ child = (2 * slot) + 1; // might be invalid, checked on next loop
1088
+ }
1089
+ }
1090
+
1091
+ template<typename T, typename S, typename A>
1092
+ void var_opt_sketch<T,S,A>::restore_towards_root(uint32_t slot_in) {
1093
+ uint32_t slot = slot_in;
1094
+ uint32_t p = (((slot + 1) / 2) - 1); // valid if slot >= 1
1095
+ while ((slot > 0) && (weights_[slot] < weights_[p])) {
1096
+ swap_values(slot, p);
1097
+ slot = p;
1098
+ p = (((slot + 1) / 2) - 1); // valid if slot >= 1
1099
+ }
1100
+ }
1101
+
1102
+ template<typename T, typename S, typename A>
1103
+ template<typename O>
1104
+ void var_opt_sketch<T,S,A>::push(O&& item, double wt, bool mark) {
1105
+ if (filled_data_) {
1106
+ data_[h_] = std::forward<O>(item);
1107
+ } else {
1108
+ new (&data_[h_]) T(std::forward<O>(item));
1109
+ filled_data_ = true;
1110
+ }
1111
+ weights_[h_] = wt;
1112
+ if (marks_ != nullptr) {
1113
+ marks_[h_] = mark;
1114
+ num_marks_in_h_ += (mark ? 1 : 0);
1115
+ }
1116
+ ++h_;
1117
+
1118
+ restore_towards_root(h_ - 1); // need use old h_, but want accurate h_
1119
+ }
1120
+
1121
+ template<typename T, typename S, typename A>
1122
+ void var_opt_sketch<T,S,A>::pop_min_to_m_region() {
1123
+ if (h_ == 0 || (h_ + m_ + r_ != k_ + 1))
1124
+ throw std::logic_error("invalid heap state popping min to M region");
1125
+
1126
+ if (h_ == 1) {
1127
+ // just update bookkeeping
1128
+ ++m_;
1129
+ --h_;
1130
+ } else {
1131
+ // main case
1132
+ uint32_t tgt = h_ - 1; // last slot, will swap with root
1133
+ swap_values(0, tgt);
1134
+ ++m_;
1135
+ --h_;
1136
+
1137
+ restore_towards_leaves(0);
1138
+ }
1139
+
1140
+ if (is_marked(h_)) {
1141
+ --num_marks_in_h_;
1142
+ }
1143
+ }
1144
+
1145
+
1146
+ template<typename T, typename S, typename A>
1147
+ void var_opt_sketch<T,S,A>::swap_values(uint32_t src, uint32_t dst) {
1148
+ std::swap(data_[src], data_[dst]);
1149
+ std::swap(weights_[src], weights_[dst]);
1150
+
1151
+ if (marks_ != nullptr) {
1152
+ std::swap(marks_[src], marks_[dst]);
1153
+ }
1154
+ }
1155
+
1156
+ /* When entering here we should be in a well-characterized state where the
1157
+ new item has been placed in either h or m and we have a valid but not necessarily
1158
+ maximal sampling plan figured out. The array is completely full at this point.
1159
+ Everyone in h and m has an explicit weight. The candidates are right-justified
1160
+ and are either just the r set or the r set + exactly one m item. The number
1161
+ of cands is at least 2. We will now grow the candidate set as much as possible
1162
+ by pulling sufficiently light items from h to m.
1163
+ */
1164
+ template<typename T, typename S, typename A>
1165
+ void var_opt_sketch<T,S,A>::grow_candidate_set(double wt_cands, uint32_t num_cands) {
1166
+ if ((h_ + m_ + r_ != k_ + 1) || (num_cands < 1) || (num_cands != m_ + r_) || (m_ >= 2))
1167
+ throw std::logic_error("invariant violated when growing candidate set");
1168
+
1169
+ while (h_ > 0) {
1170
+ const double next_wt = peek_min();
1171
+ const double next_tot_wt = wt_cands + next_wt;
1172
+
1173
+ // test for strict lightness of next prospect (denominator multiplied through)
1174
+ // ideally: (next_wt * (next_num_cands-1) < next_tot_wt)
1175
+ // but can use num_cands directly
1176
+ if ((next_wt * num_cands) < next_tot_wt) {
1177
+ wt_cands = next_tot_wt;
1178
+ ++num_cands;
1179
+ pop_min_to_m_region(); // adjusts h_ and m_
1180
+ } else {
1181
+ break;
1182
+ }
1183
+ }
1184
+
1185
+ downsample_candidate_set(wt_cands, num_cands);
1186
+ }
1187
+
1188
+ template<typename T, typename S, typename A>
1189
+ void var_opt_sketch<T,S,A>::downsample_candidate_set(double wt_cands, uint32_t num_cands) {
1190
+ if (num_cands < 2 || h_ + num_cands != k_ + 1)
1191
+ throw std::logic_error("invalid num_cands when downsampling");
1192
+
1193
+ // need this before overwriting anything
1194
+ const uint32_t delete_slot = choose_delete_slot(wt_cands, num_cands);
1195
+ const uint32_t leftmost_cand_slot = h_;
1196
+ if (delete_slot < leftmost_cand_slot || delete_slot > k_)
1197
+ throw std::logic_error("invalid delete slot index when downsampling");
1198
+
1199
+ // Overwrite weights for items from M moving into R,
1200
+ // to make bugs more obvious. Also needed so anyone reading the
1201
+ // weight knows if it's invalid without checking h_ and m_
1202
+ const uint32_t stop_idx = leftmost_cand_slot + m_;
1203
+ for (uint32_t j = leftmost_cand_slot; j < stop_idx; ++j) {
1204
+ weights_[j] = -1.0;
1205
+ }
1206
+
1207
+ // The next two lines work even when delete_slot == leftmost_cand_slot
1208
+ data_[delete_slot] = std::move(data_[leftmost_cand_slot]);
1209
+ // cannot set data_[leftmost_cand_slot] to null since not uisng T*
1210
+
1211
+ m_ = 0;
1212
+ r_ = num_cands - 1;
1213
+ total_wt_r_ = wt_cands;
1214
+ }
1215
+
1216
+ template<typename T, typename S, typename A>
1217
+ uint32_t var_opt_sketch<T,S,A>::choose_delete_slot(double wt_cands, uint32_t num_cands) const {
1218
+ if (r_ == 0) throw std::logic_error("choosing delete slot while in exact mode");
1219
+
1220
+ if (m_ == 0) {
1221
+ // this happens if we insert a really heavy item
1222
+ return pick_random_slot_in_r();
1223
+ } else if (m_ == 1) {
1224
+ // check if we keep th item in M or pick oen from R
1225
+ // p(keep) = (num_cand - 1) * wt_M / wt_cand
1226
+ double wt_m_cand = weights_[h_]; // slot of item in M is h_
1227
+ if ((wt_cands * next_double_exclude_zero()) < ((num_cands - 1) * wt_m_cand)) {
1228
+ return pick_random_slot_in_r(); // keep item in M
1229
+ } else {
1230
+ return h_; // indext of item in M
1231
+ }
1232
+ } else {
1233
+ // general case
1234
+ const uint32_t delete_slot = choose_weighted_delete_slot(wt_cands, num_cands);
1235
+ const uint32_t first_r_slot = h_ + m_;
1236
+ if (delete_slot == first_r_slot) {
1237
+ return pick_random_slot_in_r();
1238
+ } else {
1239
+ return delete_slot;
1240
+ }
1241
+ }
1242
+ }
1243
+
1244
+ template<typename T, typename S, typename A>
1245
+ uint32_t var_opt_sketch<T,S,A>::choose_weighted_delete_slot(double wt_cands, uint32_t num_cands) const {
1246
+ if (m_ < 1) throw std::logic_error("must have weighted delete slot");
1247
+
1248
+ const uint32_t offset = h_;
1249
+ const uint32_t final_m = (offset + m_) - 1;
1250
+ const uint32_t num_to_keep = num_cands - 1;
1251
+
1252
+ double left_subtotal = 0.0;
1253
+ double right_subtotal = -1.0 * wt_cands * next_double_exclude_zero();
1254
+
1255
+ for (uint32_t i = offset; i <= final_m; ++i) {
1256
+ left_subtotal += num_to_keep * weights_[i];
1257
+ right_subtotal += wt_cands;
1258
+
1259
+ if (left_subtotal < right_subtotal) {
1260
+ return i;
1261
+ }
1262
+ }
1263
+
1264
+ // this slot tells caller that we need to delete out of R
1265
+ return final_m + 1;
1266
+ }
1267
+
1268
+ template<typename T, typename S, typename A>
1269
+ uint32_t var_opt_sketch<T,S,A>::pick_random_slot_in_r() const {
1270
+ if (r_ == 0) throw std::logic_error("r_ = 0 when picking slot in R region");
1271
+ const uint32_t offset = h_ + m_;
1272
+ if (r_ == 1) {
1273
+ return offset;
1274
+ } else {
1275
+ return offset + next_int(r_);
1276
+ }
1277
+ }
1278
+
1279
+ template<typename T, typename S, typename A>
1280
+ double var_opt_sketch<T,S,A>::peek_min() const {
1281
+ if (h_ == 0) throw std::logic_error("h_ = 0 when checking min in H region");
1282
+ return weights_[0];
1283
+ }
1284
+
1285
+ template<typename T, typename S, typename A>
1286
+ inline bool var_opt_sketch<T,S,A>::is_marked(uint32_t idx) const {
1287
+ return marks_ == nullptr ? false : marks_[idx];
1288
+ }
1289
+
1290
+ template<typename T, typename S, typename A>
1291
+ double var_opt_sketch<T,S,A>::get_tau() const {
1292
+ return r_ == 0 ? std::nan("1") : (total_wt_r_ / r_);
1293
+ }
1294
+
1295
+ template<typename T, typename S, typename A>
1296
+ void var_opt_sketch<T,S,A>::strip_marks() {
1297
+ if (marks_ == nullptr) throw std::logic_error("request to strip marks from non-gadget");
1298
+ num_marks_in_h_ = 0;
1299
+ AllocBool().deallocate(marks_, curr_items_alloc_);
1300
+ marks_ = nullptr;
1301
+ }
1302
+
1303
+ template<typename T, typename S, typename A>
1304
+ void var_opt_sketch<T,S,A>::check_preamble_longs(uint8_t preamble_longs, uint8_t flags) {
1305
+ const bool is_empty(flags & EMPTY_FLAG_MASK);
1306
+
1307
+ if (is_empty) {
1308
+ if (preamble_longs != PREAMBLE_LONGS_EMPTY) {
1309
+ throw std::invalid_argument("Possible corruption: Preamble longs must be "
1310
+ + std::to_string(PREAMBLE_LONGS_EMPTY) + " for an empty sketch. Found: "
1311
+ + std::to_string(preamble_longs));
1312
+ }
1313
+ } else {
1314
+ if (preamble_longs != PREAMBLE_LONGS_WARMUP
1315
+ && preamble_longs != PREAMBLE_LONGS_FULL) {
1316
+ throw std::invalid_argument("Possible corruption: Preamble longs must be "
1317
+ + std::to_string(PREAMBLE_LONGS_WARMUP) + " or "
1318
+ + std::to_string(PREAMBLE_LONGS_FULL)
1319
+ + " for a non-empty sketch. Found: " + std::to_string(preamble_longs));
1320
+ }
1321
+ }
1322
+ }
1323
+
1324
+ template<typename T, typename S, typename A>
1325
+ void var_opt_sketch<T,S,A>::check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver) {
1326
+ if (family_id == FAMILY_ID) {
1327
+ if (ser_ver != SER_VER) {
1328
+ throw std::invalid_argument("Possible corruption: VarOpt serialization version must be "
1329
+ + std::to_string(SER_VER) + ". Found: " + std::to_string(ser_ver));
1330
+ }
1331
+ return;
1332
+ }
1333
+ // TODO: extend to handle reservoir sampling
1334
+
1335
+ throw std::invalid_argument("Possible corruption: VarOpt family id must be "
1336
+ + std::to_string(FAMILY_ID) + ". Found: " + std::to_string(family_id));
1337
+ }
1338
+
1339
+ template<typename T, typename S, typename A>
1340
+ uint32_t var_opt_sketch<T, S, A>::validate_and_get_target_size(uint32_t preamble_longs, uint32_t k, uint64_t n,
1341
+ uint32_t h, uint32_t r, resize_factor rf) {
1342
+ if (k == 0 || k > MAX_K) {
1343
+ throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
1344
+ }
1345
+
1346
+ uint32_t array_size;
1347
+
1348
+ if (n <= k) {
1349
+ if (preamble_longs != PREAMBLE_LONGS_WARMUP) {
1350
+ throw std::invalid_argument("Possible corruption: deserializing with n <= k but not in warmup mode. "
1351
+ "Found n = " + std::to_string(n) + ", k = " + std::to_string(k));
1352
+ }
1353
+ if (n != h) {
1354
+ throw std::invalid_argument("Possible corruption: deserializing in warmup mode but n != h. "
1355
+ "Found n = " + std::to_string(n) + ", h = " + std::to_string(h));
1356
+ }
1357
+ if (r > 0) {
1358
+ throw std::invalid_argument("Possible corruption: deserializing in warmup mode but r > 0. "
1359
+ "Found r = " + std::to_string(r));
1360
+ }
1361
+
1362
+ const uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k));
1363
+ const uint32_t min_lg_size = to_log_2(ceiling_power_of_2(h));
1364
+ const uint32_t initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf, min_lg_size);
1365
+ array_size = get_adjusted_size(k, 1 << initial_lg_size);
1366
+ if (array_size == k) { // if full size, need to leave 1 for the gap
1367
+ ++array_size;
1368
+ }
1369
+ } else { // n > k
1370
+ if (preamble_longs != PREAMBLE_LONGS_FULL) {
1371
+ throw std::invalid_argument("Possible corruption: deserializing with n > k but not in full mode. "
1372
+ "Found n = " + std::to_string(n) + ", k = " + std::to_string(k));
1373
+ }
1374
+ if (h + r != k) {
1375
+ throw std::invalid_argument("Possible corruption: deserializing in full mode but h + r != n. "
1376
+ "Found h = " + std::to_string(h) + ", r = " + std::to_string(r) + ", n = " + std::to_string(n));
1377
+ }
1378
+
1379
+ array_size = k + 1;
1380
+ }
1381
+
1382
+ return array_size;
1383
+ }
1384
+
1385
+ template<typename T, typename S, typename A>
1386
+ template<typename P>
1387
+ subset_summary var_opt_sketch<T, S, A>::estimate_subset_sum(P predicate) const {
1388
+ if (n_ == 0) {
1389
+ return {0.0, 0.0, 0.0, 0.0};
1390
+ }
1391
+
1392
+ double total_wt_h = 0.0;
1393
+ double h_true_wt = 0.0;
1394
+ size_t idx = 0;
1395
+ for (; idx < h_; ++idx) {
1396
+ double wt = weights_[idx];
1397
+ total_wt_h += wt;
1398
+ if (predicate(data_[idx])) {
1399
+ h_true_wt += wt;
1400
+ }
1401
+ }
1402
+
1403
+ // if only heavy items, we have an exact answer
1404
+ if (r_ == 0) {
1405
+ return {h_true_wt, h_true_wt, h_true_wt, h_true_wt};
1406
+ }
1407
+
1408
+ // since r_ > 0, we know we have samples
1409
+ const uint64_t num_samples = n_ - h_;
1410
+ double effective_sampling_rate = r_ / static_cast<double>(num_samples);
1411
+ if (effective_sampling_rate < 0.0 || effective_sampling_rate > 1.0)
1412
+ throw std::logic_error("invalid sampling rate outside [0.0, 1.0]");
1413
+
1414
+ size_t r_true_count = 0;
1415
+ ++idx; // skip the gap
1416
+ for (; idx < (k_ + 1); ++idx) {
1417
+ if (predicate(data_[idx])) {
1418
+ ++r_true_count;
1419
+ }
1420
+ }
1421
+
1422
+ double lb_true_fraction = pseudo_hypergeometric_lb_on_p(r_, r_true_count, effective_sampling_rate);
1423
+ double estimated_true_fraction = (1.0 * r_true_count) / r_;
1424
+ double ub_true_fraction = pseudo_hypergeometric_ub_on_p(r_, r_true_count, effective_sampling_rate);
1425
+
1426
+ return { h_true_wt + (total_wt_r_ * lb_true_fraction),
1427
+ h_true_wt + (total_wt_r_ * estimated_true_fraction),
1428
+ h_true_wt + (total_wt_r_ * ub_true_fraction),
1429
+ total_wt_h + total_wt_r_
1430
+ };
1431
+ }
1432
+
1433
+ template<typename T, typename S, typename A>
1434
+ class var_opt_sketch<T, S, A>::items_deleter {
1435
+ public:
1436
+ items_deleter(uint32_t num) : num(num), h_count(0), r_count(0) {}
1437
+ void set_h(uint32_t h) { h_count = h; }
1438
+ void set_r(uint32_t r) { r_count = r; }
1439
+ void operator() (T* ptr) const {
1440
+ if (h_count > 0) {
1441
+ for (size_t i = 0; i < h_count; ++i) {
1442
+ ptr[i].~T();
1443
+ }
1444
+ }
1445
+ if (r_count > 0) {
1446
+ uint32_t end = h_count + r_count + 1;
1447
+ for (size_t i = h_count + 1; i < end; ++i) {
1448
+ ptr[i].~T();
1449
+ }
1450
+ }
1451
+ if (ptr != nullptr) {
1452
+ A().deallocate(ptr, num);
1453
+ }
1454
+ }
1455
+ private:
1456
+ uint32_t num;
1457
+ uint32_t h_count;
1458
+ uint32_t r_count;
1459
+ };
1460
+
1461
+ template<typename T, typename S, typename A>
1462
+ class var_opt_sketch<T, S, A>::weights_deleter {
1463
+ public:
1464
+ weights_deleter(uint32_t num) : num(num) {}
1465
+ void operator() (double* ptr) const {
1466
+ if (ptr != nullptr) {
1467
+ AllocDouble().deallocate(ptr, num);
1468
+ }
1469
+ }
1470
+ private:
1471
+ uint32_t num;
1472
+ };
1473
+
1474
+ template<typename T, typename S, typename A>
1475
+ class var_opt_sketch<T, S, A>::marks_deleter {
1476
+ public:
1477
+ marks_deleter(uint32_t num) : num(num) {}
1478
+ void operator() (bool* ptr) const {
1479
+ if (ptr != nullptr) {
1480
+ AllocBool().deallocate(ptr, 1);
1481
+ }
1482
+ }
1483
+ private:
1484
+ uint32_t num;
1485
+ };
1486
+
1487
+
1488
+ template<typename T, typename S, typename A>
1489
+ typename var_opt_sketch<T, S, A>::const_iterator var_opt_sketch<T, S, A>::begin() const {
1490
+ return var_opt_sketch<T, S, A>::const_iterator(*this, false);
1491
+ }
1492
+
1493
+ template<typename T, typename S, typename A>
1494
+ typename var_opt_sketch<T, S, A>::const_iterator var_opt_sketch<T, S, A>::end() const {
1495
+ return var_opt_sketch<T, S, A>::const_iterator(*this, true);
1496
+ }
1497
+
1498
+ // -------- var_opt_sketch::const_iterator implementation ---------
1499
+
1500
+ template<typename T, typename S, typename A>
1501
+ var_opt_sketch<T,S,A>::const_iterator::const_iterator(const var_opt_sketch<T,S,A>& sk, bool is_end) :
1502
+ sk_(&sk),
1503
+ cum_r_weight_(0.0),
1504
+ r_item_wt_(sk.get_tau()),
1505
+ final_idx_(sk.r_ > 0 ? sk.h_ + sk.r_ + 1 : sk.h_)
1506
+ {
1507
+ // index logic easier to read if not inline
1508
+ if (is_end) {
1509
+ idx_ = final_idx_;
1510
+ sk_ = nullptr;
1511
+ } else {
1512
+ idx_ = (sk.h_ == 0 && sk.r_ > 0 ? 1 : 0); // skip if gap is at start
1513
+ }
1514
+
1515
+ // should only apply if sketch is empty
1516
+ if (idx_ == final_idx_) { sk_ = nullptr; }
1517
+ }
1518
+
1519
+ template<typename T, typename S, typename A>
1520
+ var_opt_sketch<T,S,A>::const_iterator::const_iterator(const var_opt_sketch<T,S,A>& sk, bool is_end, bool use_r_region) :
1521
+ sk_(&sk),
1522
+ cum_r_weight_(0.0),
1523
+ r_item_wt_(sk.get_tau()),
1524
+ final_idx_(sk.h_ + (use_r_region ? 1 + sk.r_ : 0))
1525
+ {
1526
+ if (use_r_region) {
1527
+ idx_ = sk.h_ + 1 + (is_end ? sk.r_ : 0);
1528
+ } else { // H region
1529
+ // gap at start only if h_ == 0, so index always starts at 0
1530
+ idx_ = (is_end ? sk.h_ : 0);
1531
+ }
1532
+
1533
+ // unlike in full iterator case, may happen even if sketch is not empty
1534
+ if (idx_ == final_idx_) { sk_ = nullptr; }
1535
+ }
1536
+
1537
+
1538
+ template<typename T, typename S, typename A>
1539
+ var_opt_sketch<T, S, A>::const_iterator::const_iterator(const const_iterator& other) :
1540
+ sk_(other.sk_),
1541
+ cum_r_weight_(other.cum_r_weight_),
1542
+ r_item_wt_(other.r_item_wt_),
1543
+ idx_(other.idx_),
1544
+ final_idx_(other.final_idx_)
1545
+ {}
1546
+
1547
+ template<typename T, typename S, typename A>
1548
+ typename var_opt_sketch<T, S, A>::const_iterator& var_opt_sketch<T, S, A>::const_iterator::operator++() {
1549
+ ++idx_;
1550
+
1551
+ if (idx_ == final_idx_) {
1552
+ sk_ = nullptr;
1553
+ return *this;
1554
+ } else if (idx_ == sk_->h_ && sk_->r_ > 0) { // check for the gap
1555
+ ++idx_;
1556
+ }
1557
+ if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
1558
+ return *this;
1559
+ }
1560
+
1561
+ template<typename T, typename S, typename A>
1562
+ typename var_opt_sketch<T, S, A>::const_iterator& var_opt_sketch<T, S, A>::const_iterator::operator++(int) {
1563
+ const_iterator tmp(*this);
1564
+ operator++();
1565
+ return tmp;
1566
+ }
1567
+
1568
+ template<typename T, typename S, typename A>
1569
+ bool var_opt_sketch<T, S, A>::const_iterator::operator==(const const_iterator& other) const {
1570
+ if (sk_ != other.sk_) return false;
1571
+ if (sk_ == nullptr) return true; // end (and we know other.sk_ is also null)
1572
+ return idx_ == other.idx_;
1573
+ }
1574
+
1575
+ template<typename T, typename S, typename A>
1576
+ bool var_opt_sketch<T, S, A>::const_iterator::operator!=(const const_iterator& other) const {
1577
+ return !operator==(other);
1578
+ }
1579
+
1580
+ template<typename T, typename S, typename A>
1581
+ const std::pair<const T&, const double> var_opt_sketch<T, S, A>::const_iterator::operator*() const {
1582
+ double wt;
1583
+ if (idx_ < sk_->h_) {
1584
+ wt = sk_->weights_[idx_];
1585
+ } else {
1586
+ wt = r_item_wt_;
1587
+ }
1588
+ return std::pair<const T&, const double>(sk_->data_[idx_], wt);
1589
+ }
1590
+
1591
+ template<typename T, typename S, typename A>
1592
+ bool var_opt_sketch<T, S, A>::const_iterator::get_mark() const {
1593
+ return sk_->marks_ == nullptr ? false : sk_->marks_[idx_];
1594
+ }
1595
+
1596
+
1597
+ // -------- var_opt_sketch::iterator implementation ---------
1598
+
1599
+ template<typename T, typename S, typename A>
1600
+ var_opt_sketch<T,S,A>::iterator::iterator(const var_opt_sketch<T,S,A>& sk, bool is_end, bool use_r_region) :
1601
+ sk_(&sk),
1602
+ cum_r_weight_(0.0),
1603
+ r_item_wt_(sk.get_tau()),
1604
+ final_idx_(sk.h_ + (use_r_region ? 1 + sk.r_ : 0))
1605
+ {
1606
+ if (use_r_region) {
1607
+ idx_ = sk.h_ + 1 + (is_end ? sk.r_ : 0);
1608
+ } else { // H region
1609
+ // gap at start only if h_ == 0, so index always starts at 0
1610
+ idx_ = (is_end ? sk.h_ : 0);
1611
+ }
1612
+
1613
+ // unlike in full iterator case, may happen even if sketch is not empty
1614
+ if (idx_ == final_idx_) { sk_ = nullptr; }
1615
+ }
1616
+
1617
+ template<typename T, typename S, typename A>
1618
+ var_opt_sketch<T, S, A>::iterator::iterator(const iterator& other) :
1619
+ sk_(other.sk_),
1620
+ cum_r_weight_(other.cum_r_weight_),
1621
+ r_item_wt_(other.r_item_wt_),
1622
+ idx_(other.idx_),
1623
+ final_idx_(other.final_idx_)
1624
+ {}
1625
+
1626
+ template<typename T, typename S, typename A>
1627
+ typename var_opt_sketch<T, S, A>::iterator& var_opt_sketch<T, S, A>::iterator::operator++() {
1628
+ ++idx_;
1629
+
1630
+ if (idx_ == final_idx_) {
1631
+ sk_ = nullptr;
1632
+ return *this;
1633
+ } else if (idx_ == sk_->h_ && sk_->r_ > 0) { // check for the gap
1634
+ ++idx_;
1635
+ }
1636
+ if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
1637
+ return *this;
1638
+ }
1639
+
1640
+ template<typename T, typename S, typename A>
1641
+ typename var_opt_sketch<T, S, A>::iterator& var_opt_sketch<T, S, A>::iterator::operator++(int) {
1642
+ const_iterator tmp(*this);
1643
+ operator++();
1644
+ return tmp;
1645
+ }
1646
+
1647
+ template<typename T, typename S, typename A>
1648
+ bool var_opt_sketch<T, S, A>::iterator::operator==(const iterator& other) const {
1649
+ if (sk_ != other.sk_) return false;
1650
+ if (sk_ == nullptr) return true; // end (and we know other.sk_ is also null)
1651
+ return idx_ == other.idx_;
1652
+ }
1653
+
1654
+ template<typename T, typename S, typename A>
1655
+ bool var_opt_sketch<T, S, A>::iterator::operator!=(const iterator& other) const {
1656
+ return !operator==(other);
1657
+ }
1658
+
1659
+ template<typename T, typename S, typename A>
1660
+ std::pair<T&, double> var_opt_sketch<T, S, A>::iterator::operator*() {
1661
+ double wt;
1662
+ if (idx_ < sk_->h_) {
1663
+ wt = sk_->weights_[idx_];
1664
+ } else if (idx_ == final_idx_ - 1) {
1665
+ wt = sk_->total_wt_r_ - cum_r_weight_;
1666
+ } else {
1667
+ wt = r_item_wt_;
1668
+ }
1669
+ return std::pair<T&, double>(sk_->data_[idx_], wt);
1670
+ }
1671
+
1672
+ template<typename T, typename S, typename A>
1673
+ bool var_opt_sketch<T, S, A>::iterator::get_mark() const {
1674
+ return sk_->marks_ == nullptr ? false : sk_->marks_[idx_];
1675
+ }
1676
+
1677
+
1678
+
1679
+ // ******************** MOVE TO COMMON UTILS AREA EVENTUALLY *********************
1680
+
1681
// Random number helpers used by the sampling code below.
namespace random_utils {
  // seed source for the generator; possibly unsafe in MinGW with GCC < 9.2
  static std::random_device rd;
  // 64-bit Mersenne Twister seeded once from rd
  static std::mt19937_64 rand(rd());
  // distribution over doubles constructed with bounds 0.0 and 1.0
  static std::uniform_real_distribution<double> next_double(0.0, 1.0);
}
1686
+
1687
+ /**
1688
+ * Checks if target sampling allocation is more than 50% of max sampling size.
1689
+ * If so, returns max sampling size, otherwise passes through target size.
1690
+ */
1691
+ template<typename T, typename S, typename A>
1692
+ uint32_t var_opt_sketch<T,S,A>::get_adjusted_size(uint32_t max_size, uint32_t resize_target) {
1693
+ if (max_size - (resize_target << 1) < 0L) {
1694
+ return max_size;
1695
+ }
1696
+ return resize_target;
1697
+ }
1698
+
1699
+ template<typename T, typename S, typename A>
1700
+ uint32_t var_opt_sketch<T,S,A>::starting_sub_multiple(uint32_t lg_target, uint32_t lg_rf, uint32_t lg_min) {
1701
+ return (lg_target <= lg_min)
1702
+ ? lg_min : (lg_rf == 0) ? lg_target
1703
+ : (lg_target - lg_min) % lg_rf + lg_min;
1704
+ }
1705
+
1706
+ template<typename T, typename S, typename A>
1707
+ double var_opt_sketch<T,S,A>::pseudo_hypergeometric_ub_on_p(uint64_t n, uint32_t k, double sampling_rate) {
1708
+ const double adjusted_kappa = DEFAULT_KAPPA * sqrt(1 - sampling_rate);
1709
+ return bounds_binomial_proportions::approximate_upper_bound_on_p(n, k, adjusted_kappa);
1710
+ }
1711
+
1712
+ template<typename T, typename S, typename A>
1713
+ double var_opt_sketch<T,S,A>::pseudo_hypergeometric_lb_on_p(uint64_t n, uint32_t k, double sampling_rate) {
1714
+ const double adjusted_kappa = DEFAULT_KAPPA * sqrt(1 - sampling_rate);
1715
+ return bounds_binomial_proportions::approximate_lower_bound_on_p(n, k, adjusted_kappa);
1716
+ }
1717
+
1718
+ template<typename T, typename S, typename A>
1719
+ bool var_opt_sketch<T,S,A>::is_power_of_2(uint32_t v) {
1720
+ return v && !(v & (v - 1));
1721
+ }
1722
+
1723
+ template<typename T, typename S, typename A>
1724
+ uint32_t var_opt_sketch<T,S,A>::to_log_2(uint32_t v) {
1725
+ if (is_power_of_2(v)) {
1726
+ return count_trailing_zeros_in_u32(v);
1727
+ } else {
1728
+ throw std::invalid_argument("Attempt to compute integer log2 of non-positive or non-power of 2");
1729
+ }
1730
+ }
1731
+
1732
+ // Returns an integer in the range [0, max_value) -- excludes max_value
1733
+ template<typename T, typename S, typename A>
1734
+ uint32_t var_opt_sketch<T,S,A>::next_int(uint32_t max_value) {
1735
+ std::uniform_int_distribution<uint32_t> dist(0, max_value - 1);
1736
+ return dist(random_utils::rand);
1737
+ }
1738
+
1739
+ template<typename T, typename S, typename A>
1740
+ double var_opt_sketch<T,S,A>::next_double_exclude_zero() {
1741
+ double r = random_utils::next_double(random_utils::rand);
1742
+ while (r == 0.0) {
1743
+ r = random_utils::next_double(random_utils::rand);
1744
+ }
1745
+ return r;
1746
+ }
1747
+
1748
+ }
1749
+
1750
+ // namespace datasketches
1751
+
1752
+ #endif // _VAR_OPT_SKETCH_IMPL_HPP_