datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,239 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _VAR_OPT_UNION_HPP_
21
+ #define _VAR_OPT_UNION_HPP_
22
+
23
+ #include "var_opt_sketch.hpp"
24
+ #include "common_defs.hpp"
25
+ #include "serde.hpp"
26
+
27
+ #include <vector>
28
+
29
+ namespace datasketches {
30
+
31
+ template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>; // allocator type A rebound (via std::allocator_traits) to allocate uint8_t
32
+
33
+ /**
34
+ * Provides a unioning operation over var_opt_sketch objects. This union allows
35
+ * the sample size k to float, possibly increasing or decreasing as warranted by
36
+ * the available data.
37
+ *
38
+ * The union currently allows serialization and deserialization, even though transporting
39
+ * union objects seems to be an anti-pattern with most sketches. We currently provide it here
40
+ * because the get_result() call may need to discard samples and decrease k in order to
41
+ * return a valid sketch, even if future calls to update() would allow k to remain larger.
42
+ *
43
+ * The (de)serialization methods may be deprecated and subsequently removed in future versions.
44
+ *
45
+ * author Kevin Lang
46
+ * author Jon Malkin
47
+ */
48
+ template <typename T, typename S = serde<T>, typename A = std::allocator<T>>
49
+ class var_opt_union {
50
+
51
+ public:
52
+ static const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2; // evaluates to 2^31 - 2
53
+
54
+ explicit var_opt_union(uint32_t max_k); // n and both outer-tau terms start at zero; the gadget sketch is constructed with max_k
55
+ var_opt_union(const var_opt_union& other); // copies every field, including the gadget sketch
56
+ var_opt_union(var_opt_union&& other) noexcept; // copies the scalar fields and moves the gadget sketch out of other
57
+
58
+ ~var_opt_union();
59
+
60
+ var_opt_union& operator=(const var_opt_union& other);
61
+ var_opt_union& operator=(var_opt_union&& other); // NOTE(review): not declared noexcept, unlike the move constructor - confirm whether that is intentional
62
+
63
+ /**
64
+ * Updates this union with the given sketch.
65
+ * This overload takes an lvalue reference to the sketch.
66
+ * @param sk a sketch to add to the union
67
+ */
68
+ void update(const var_opt_sketch<T,S,A>& sk);
69
+
70
+ /**
71
+ * Updates this union with the given sketch.
72
+ * This overload takes an rvalue reference to the sketch.
73
+ * @param sk a sketch to add to the union
74
+ */
75
+ void update(var_opt_sketch<T,S,A>&& sk);
76
+
77
+ /**
78
+ * Gets the varopt sketch resulting from the union of any input sketches.
79
+ * @return a varopt sketch
80
+ */
81
+ var_opt_sketch<T,S,A> get_result() const;
82
+
83
+ /**
84
+ * Resets the union to its default, empty state.
85
+ */
86
+ void reset();
87
+
88
+ /**
89
+ * Computes size needed to serialize the current state of the union.
90
+ * This version is for all other types and can be expensive since every item needs to be looked at.
91
+ * @return size in bytes needed to serialize this union
92
+ */
93
+ size_t get_serialized_size_bytes() const;
94
+
95
+ // This is a convenience alias for users
96
+ // The type returned by the following serialize method
97
+ typedef vector_u8<A> vector_bytes;
98
+
99
+ /**
100
+ * NOTE: This method may be deprecated in a future version.
101
+ * This method serializes the union as a vector of bytes.
102
+ * An optional header can be reserved in front of the serialized union.
103
+ * It is a blank space of a given size.
104
+ * This header is used in Datasketches PostgreSQL extension.
105
+ * @param header_size_bytes space to reserve in front of the serialized union
+ * @return the serialized bytes as a vector_bytes
106
+ */
107
+ vector_bytes serialize(unsigned header_size_bytes = 0) const;
108
+
109
+ /**
110
+ * NOTE: This method may be deprecated in a future version.
111
+ * This method serializes the union into a given stream in a binary form.
112
+ * @param os output stream
113
+ */
114
+ void serialize(std::ostream& os) const;
115
+
116
+ /**
117
+ * NOTE: This method may be deprecated in a future version.
118
+ * This method deserializes a union from a given stream.
119
+ * @param is input stream
120
+ * @return an instance of a union
121
+ */
122
+ static var_opt_union deserialize(std::istream& is);
123
+
124
+ /**
125
+ * NOTE: This method may be deprecated in a future version.
126
+ * This method deserializes a union from a given array of bytes.
127
+ * @param bytes pointer to the array of bytes
128
+ * @param size the size of the array
129
+ * @return an instance of a union
130
+ */
131
+ static var_opt_union deserialize(const void* bytes, size_t size);
132
+
133
+ /**
134
+ * Prints a summary of the union as a string.
135
+ * @return the summary as a string
136
+ */
137
+ string<A> to_string() const;
138
+
139
+
140
+ private:
141
+ typedef typename std::allocator_traits<A>::template rebind_alloc<var_opt_sketch<T,S,A>> AllocSketch; // allocator A rebound to allocate var_opt_sketch<T,S,A>
142
+
143
+ static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
144
+ static const uint8_t PREAMBLE_LONGS_NON_EMPTY = 4;
145
+ static const uint8_t SER_VER = 2;
146
+ static const uint8_t FAMILY_ID = 14;
147
+ static const uint8_t EMPTY_FLAG_MASK = 4;
148
+
149
+ uint64_t n_; // cumulative over all input sketches
150
+
151
+ // outer tau is the largest tau of any input sketch
152
+ double outer_tau_numer_; // total weight of all input R-zones where tau = outer_tau
153
+
154
+ // total cardinality of the same R-zones, or zero if no input sketch was in estimation mode
155
+ uint64_t outer_tau_denom_;
156
+
157
+ uint32_t max_k_; // the public constructor stores its max_k argument here
158
+
159
+ var_opt_sketch<T,S,A> gadget_; // see the IMPORTANT NOTE below: resembles a varopt sketch but is not one
160
+
161
+ var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
162
+ uint32_t max_k, var_opt_sketch<T,S,A>&& gadget); // parameters mirror the data members above; gadget is accepted as an rvalue
163
+
164
+ /*
165
+ IMPORTANT NOTE: the "gadget" in the union object appears to be a varopt sketch,
166
+ but in fact is NOT because it doesn't satisfy the mathematical definition
167
+ of a varopt sketch of the concatenated input streams. Therefore it could be different
168
+ from a true varopt sketch with that value of K, in which case it could easily provide
169
+ worse estimation accuracy for subset-sum queries.
170
+
171
+ This should not surprise you; the approximation guarantees of varopt sketches
172
+ do not apply to things that merely resemble varopt sketches.
173
+
174
+ However, even though the gadget is not a varopt sketch, the result
175
+ of the unioning process IS a varopt sketch. It is constructed by a
176
+ somewhat complicated "resolution" process which determines the largest K
177
+ that a valid varopt sketch could have given the available information,
178
+ then constructs a varopt sketch of that size and returns it.
179
+
180
+ However, the gadget itself is not touched during the resolution process,
181
+ and additional sketches could subsequently be merged into the union,
182
+ at which point a varopt result could again be requested.
183
+ */
184
+
185
+ /*
186
+ Explanation of "marked items" in the union's gadget:
187
+
188
+ The boolean value "true" in a pair indicates that the item
189
+ came from an input sketch's R zone, so it is already the result of sampling.
190
+
191
+ Therefore it must not wind up in the H zone of the final result, because
192
+ that would imply that the item is "exact".
193
+
194
+ However, it is okay for a marked item to hang out in the gadget's H zone for a while.
195
+
196
+ And once the item has moved to the gadget's R zone, the mark is never checked again,
197
+ so no effort is made to ensure that its value is preserved or even makes sense.
198
+ */
199
+
200
+ /*
201
+ Note: if the computer could perform exact real-valued arithmetic, the union could finalize
202
+ its result by reducing k until inner_tau > outer_tau. [Due to the vagaries of floating point
203
+ arithmetic, we won't attempt to detect and specially handle the inner_tau = outer_tau special
204
+ case.]
205
+
206
+ In fact, we won't even look at tau while reducing k. Instead the logic will be based
207
+ on the more robust integer quantity num_marks_in_h_ in the gadget. It is conceivable that due
208
+ to round-off error we could end up with inner_tau slightly less than outer_tau, but that should
209
+ be fairly harmless since we will have achieved our goal of getting the marked items out of H.
210
+
211
+ Also, you might be wondering why we are bothering to maintain the numerator and denominator
212
+ separately instead of just having a single variable outer_tau. This allows us (in certain
213
+ cases) to add an input's entire R-zone weight into the result sketch, as opposed to subdividing
214
+ it then adding it back up. That would be a source of numerical inaccuracy. And even
215
+ more importantly, this design choice allows us to exactly re-construct the input sketch
216
+ when there is only one of them.
217
+ */
218
+ inline void merge_items(const var_opt_sketch<T,S,A>& sk);
219
+ inline void merge_items(var_opt_sketch<T,S,A>&& sk);
220
+ inline void resolve_tau(const var_opt_sketch<T,S,A>& sketch);
221
+
222
+ double get_outer_tau() const;
223
+
224
+ var_opt_sketch<T,S,A> simple_gadget_coercer() const;
225
+
226
+ bool there_exist_unmarked_h_items_lighter_than_target(double threshold) const;
227
+ bool detect_and_handle_subcase_of_pseudo_exact(var_opt_sketch<T,S,A>& sk) const;
228
+ void mark_moving_gadget_coercer(var_opt_sketch<T,S,A>& sk) const;
229
+ void migrate_marked_items_by_decreasing_k(var_opt_sketch<T,S,A>& sk) const;
230
+
231
+ static void check_preamble_longs(uint8_t preamble_longs, uint8_t flags);
232
+ static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
233
+ };
234
+
235
+ }
236
+
237
+ #include "var_opt_union_impl.hpp"
238
+
239
+ #endif // _VAR_OPT_UNION_HPP_
@@ -0,0 +1,645 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _VAR_OPT_UNION_IMPL_HPP_
21
+ #define _VAR_OPT_UNION_IMPL_HPP_
22
+
23
+ #include "var_opt_union.hpp"
24
+
25
+ #include <cmath>
26
+ #include <sstream>
27
+
28
+ namespace datasketches {
29
+
30
+ template<typename T, typename S, typename A>
31
+ var_opt_union<T,S,A>::var_opt_union(uint32_t max_k) :
32
+ n_(0),
33
+ outer_tau_numer_(0),
34
+ outer_tau_denom_(0.0),
35
+ max_k_(max_k),
36
+ gadget_(max_k, var_opt_sketch<T,S,A>::DEFAULT_RESIZE_FACTOR, true)
37
+ {}
38
+
39
+ template<typename T, typename S, typename A>
40
+ var_opt_union<T,S,A>::var_opt_union(const var_opt_union& other) :
41
+ n_(other.n_),
42
+ outer_tau_numer_(other.outer_tau_numer_),
43
+ outer_tau_denom_(other.outer_tau_denom_),
44
+ max_k_(other.max_k_),
45
+ gadget_(other.gadget_)
46
+ {}
47
+
48
+ template<typename T, typename S, typename A>
49
+ var_opt_union<T,S,A>::var_opt_union(var_opt_union&& other) noexcept :
50
+ n_(other.n_),
51
+ outer_tau_numer_(other.outer_tau_numer_),
52
+ outer_tau_denom_(other.outer_tau_denom_),
53
+ max_k_(other.max_k_),
54
+ gadget_(std::move(other.gadget_))
55
+ {}
56
+
57
+ template<typename T, typename S, typename A>
58
+ var_opt_union<T,S,A>::var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
59
+ uint32_t max_k, var_opt_sketch<T,S,A>&& gadget) :
60
+ n_(n),
61
+ outer_tau_numer_(outer_tau_numer),
62
+ outer_tau_denom_(outer_tau_denom),
63
+ max_k_(max_k),
64
+ gadget_(gadget)
65
+ {}
66
+
67
+ template<typename T, typename S, typename A>
68
+ var_opt_union<T,S,A>::~var_opt_union() {}
69
+
70
+ template<typename T, typename S, typename A>
71
+ var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(const var_opt_union& other) {
72
+ var_opt_union<T,S,A> union_copy(other);
73
+ std::swap(n_, union_copy.n_);
74
+ std::swap(outer_tau_numer_, union_copy.outer_tau_numer_);
75
+ std::swap(outer_tau_denom_, union_copy.outer_tau_denom_);
76
+ std::swap(max_k_, union_copy.max_k_);
77
+ std::swap(gadget_, union_copy.gadget_);
78
+ return *this;
79
+ }
80
+
81
+ template<typename T, typename S, typename A>
82
+ var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(var_opt_union&& other) {
83
+ std::swap(n_, other.n_);
84
+ std::swap(outer_tau_numer_, other.outer_tau_numer_);
85
+ std::swap(outer_tau_denom_, other.outer_tau_denom_);
86
+ std::swap(max_k_, other.max_k_);
87
+ std::swap(gadget_, other.gadget_);
88
+ return *this;
89
+ }
90
+
91
+ /*
92
+ * An empty union requires 8 bytes.
93
+ *
94
+ * <pre>
95
+ * Long || Start Byte Adr:
96
+ * Adr:
97
+ * || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
98
+ * 0 || Preamble_Longs | SerVer | FamID | Flags |---------Max Res. Size (K)---------|
99
+ * </pre>
100
+ *
101
+ * A non-empty sketch requires 24 bytes of preamble for an under-full sample; once there are
102
+ * at least k items the sketch uses 32 bytes of preamble.
103
+ *
104
+ * The count of items seen is limited to 48 bits (~256 trillion) even though there are adjacent
105
+ * unused preamble bits. The acceptance probability for an item is a double in the range [0,1),
106
+ * limiting us to 53 bits of randomness due to details of the IEEE floating point format. To
107
+ * ensure meaningful probabilities as the items seen count approaches capacity, we intentionally
108
+ * use slightly fewer bits.
109
+ *
110
+ * Following the header are weights for the heavy items, then marks in the event this is a gadget.
111
+ * The serialized items come last.
112
+ *
113
+ * <pre>
114
+ * Long || Start Byte Adr:
115
+ * Adr:
116
+ * || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
117
+ * 0 || Preamble_Longs | SerVer | FamID | Flags |---------Max Res. Size (K)---------|
118
+ *
119
+ * || 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
120
+ * 1 ||---------------------------Items Seen Count (N)--------------------------------|
121
+ *
122
+ * || 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
123
+ * 2 ||------------------------Outer Tau Numerator (double)---------------------------|
124
+ *
125
+ * || 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
126
+ * 3 ||----------------------Outer Tau Denominator (uint64_t)-------------------------|
127
+ * </pre>
128
+ */
129
+
130
+ template<typename T, typename S, typename A>
131
+ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is) {
132
+ uint8_t preamble_longs;
133
+ is.read((char*)&preamble_longs, sizeof(preamble_longs));
134
+ uint8_t serial_version;
135
+ is.read((char*)&serial_version, sizeof(serial_version));
136
+ uint8_t family_id;
137
+ is.read((char*)&family_id, sizeof(family_id));
138
+ uint8_t flags;
139
+ is.read((char*)&flags, sizeof(flags));
140
+ uint32_t max_k;
141
+ is.read((char*)&max_k, sizeof(max_k));
142
+
143
+ check_preamble_longs(preamble_longs, flags);
144
+ check_family_and_serialization_version(family_id, serial_version);
145
+
146
+ if (max_k == 0 || max_k > MAX_K) {
147
+ throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
148
+ }
149
+
150
+ bool is_empty = flags & EMPTY_FLAG_MASK;
151
+
152
+ if (is_empty) {
153
+ if (!is.good())
154
+ throw std::runtime_error("error reading from std::istream");
155
+ else
156
+ return var_opt_union<T,S,A>(max_k);
157
+ }
158
+
159
+ uint64_t items_seen;
160
+ is.read((char*)&items_seen, sizeof(items_seen));
161
+ double outer_tau_numer;
162
+ is.read((char*)&outer_tau_numer, sizeof(outer_tau_numer));
163
+ uint64_t outer_tau_denom;
164
+ is.read((char*)&outer_tau_denom, sizeof(outer_tau_denom));
165
+
166
+ var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is);
167
+
168
+ if (!is.good())
169
+ throw std::runtime_error("error reading from std::istream");
170
+
171
+ return var_opt_union<T,S,A>(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
172
+ }
173
+
174
+ template<typename T, typename S, typename A>
175
+ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t size) {
176
+ ensure_minimum_memory(size, 8);
177
+ const char* ptr = static_cast<const char*>(bytes);
178
+ uint8_t preamble_longs;
179
+ ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
180
+ uint8_t serial_version;
181
+ ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
182
+ uint8_t family_id;
183
+ ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
184
+ uint8_t flags;
185
+ ptr += copy_from_mem(ptr, &flags, sizeof(flags));
186
+ uint32_t max_k;
187
+ ptr += copy_from_mem(ptr, &max_k, sizeof(max_k));
188
+
189
+ check_preamble_longs(preamble_longs, flags);
190
+ check_family_and_serialization_version(family_id, serial_version);
191
+
192
+ if (max_k == 0 || max_k > MAX_K) {
193
+ throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
194
+ }
195
+
196
+ bool is_empty = flags & EMPTY_FLAG_MASK;
197
+
198
+ if (is_empty) {
199
+ return var_opt_union<T,S,A>(max_k);
200
+ }
201
+
202
+ uint64_t items_seen;
203
+ ptr += copy_from_mem(ptr, &items_seen, sizeof(items_seen));
204
+ double outer_tau_numer;
205
+ ptr += copy_from_mem(ptr, &outer_tau_numer, sizeof(outer_tau_numer));
206
+ uint64_t outer_tau_denom;
207
+ ptr += copy_from_mem(ptr, &outer_tau_denom, sizeof(outer_tau_denom));
208
+
209
+ const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
210
+ var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size);
211
+
212
+ return var_opt_union<T,S,A>(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
213
+ }
214
+
215
+ template<typename T, typename S, typename A>
216
+ size_t var_opt_union<T,S,A>::get_serialized_size_bytes() const {
217
+ if (n_ == 0) {
218
+ return PREAMBLE_LONGS_EMPTY << 3;
219
+ } else {
220
+ return (PREAMBLE_LONGS_NON_EMPTY << 3) + gadget_.get_serialized_size_bytes();
221
+ }
222
+ }
223
+
224
+ template<typename T, typename S, typename A>
225
+ void var_opt_union<T,S,A>::serialize(std::ostream& os) const {
226
+ bool empty = (n_ == 0);
227
+
228
+ const uint8_t serialization_version(SER_VER);
229
+ const uint8_t family_id(FAMILY_ID);
230
+
231
+ uint8_t preamble_longs;
232
+ uint8_t flags;
233
+ if (empty) {
234
+ preamble_longs = PREAMBLE_LONGS_EMPTY;
235
+ flags = EMPTY_FLAG_MASK;
236
+ } else {
237
+ preamble_longs = PREAMBLE_LONGS_NON_EMPTY;
238
+ flags = 0;
239
+ }
240
+
241
+ os.write((char*) &preamble_longs, sizeof(uint8_t));
242
+ os.write((char*) &serialization_version, sizeof(uint8_t));
243
+ os.write((char*) &family_id, sizeof(uint8_t));
244
+ os.write((char*) &flags, sizeof(uint8_t));
245
+ os.write((char*) &max_k_, sizeof(uint32_t));
246
+
247
+ if (!empty) {
248
+ os.write((char*) &n_, sizeof(uint64_t));
249
+ os.write((char*) &outer_tau_numer_, sizeof(double));
250
+ os.write((char*) &outer_tau_denom_, sizeof(uint64_t));
251
+ gadget_.serialize(os);
252
+ }
253
+ }
254
+
255
+ template<typename T, typename S, typename A>
256
+ std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header_size_bytes) const {
257
+ const size_t size = header_size_bytes + get_serialized_size_bytes();
258
+ std::vector<uint8_t, AllocU8<A>> bytes(size);
259
+ uint8_t* ptr = bytes.data() + header_size_bytes;
260
+
261
+ const bool empty = n_ == 0;
262
+
263
+ const uint8_t serialization_version(SER_VER);
264
+ const uint8_t family_id(FAMILY_ID);
265
+
266
+ uint8_t preamble_longs;
267
+ uint8_t flags;
268
+
269
+ if (empty) {
270
+ preamble_longs = PREAMBLE_LONGS_EMPTY;
271
+ flags = EMPTY_FLAG_MASK;
272
+ } else {
273
+ preamble_longs = PREAMBLE_LONGS_NON_EMPTY;
274
+ flags = 0;
275
+ }
276
+
277
+ // first prelong
278
+ ptr += copy_to_mem(&preamble_longs, ptr, sizeof(uint8_t));
279
+ ptr += copy_to_mem(&serialization_version, ptr, sizeof(uint8_t));
280
+ ptr += copy_to_mem(&family_id, ptr, sizeof(uint8_t));
281
+ ptr += copy_to_mem(&flags, ptr, sizeof(uint8_t));
282
+ ptr += copy_to_mem(&max_k_, ptr, sizeof(uint32_t));
283
+
284
+ if (!empty) {
285
+ ptr += copy_to_mem(&n_, ptr, sizeof(uint64_t));
286
+ ptr += copy_to_mem(&outer_tau_numer_, ptr, sizeof(double));
287
+ ptr += copy_to_mem(&outer_tau_denom_, ptr, sizeof(uint64_t));
288
+
289
+ auto gadget_bytes = gadget_.serialize();
290
+ ptr += copy_to_mem(gadget_bytes.data(), ptr, gadget_bytes.size() * sizeof(uint8_t));
291
+ }
292
+
293
+ return bytes;
294
+ }
295
+
296
+ template<typename T, typename S, typename A>
297
+ void var_opt_union<T,S,A>::reset() {
298
+ n_ = 0;
299
+ outer_tau_numer_ = 0.0;
300
+ outer_tau_denom_ = 0;
301
+ gadget_.reset();
302
+ }
303
+
304
+ template<typename T, typename S, typename A>
305
+ string<A> var_opt_union<T,S,A>::to_string() const {
306
+ std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
307
+ os << "### VarOpt Union SUMMARY: " << std::endl;
308
+ os << " . n : " << n_ << std::endl;
309
+ os << " Max k : " << max_k_ << std::endl;
310
+ os << " Gadget Summary: " << std::endl;
311
+ os << gadget_.to_string();
312
+ os << "### END VarOpt Union SUMMARY: " << std::endl;
313
+ return os.str();
314
+ }
315
+
316
+ template<typename T, typename S, typename A>
317
+ void var_opt_union<T,S,A>::update(const var_opt_sketch<T,S,A>& sk) {
318
+ merge_items(sk);
319
+ resolve_tau(sk);
320
+ }
321
+
322
+ template<typename T, typename S, typename A>
323
+ void var_opt_union<T,S,A>::update(var_opt_sketch<T,S,A>&& sk) {
324
+ merge_items(std::move(sk));
325
+ resolve_tau(sk); // don't need items, so ok even if they've been moved out
326
+ }
327
+
328
+ template<typename T, typename S, typename A>
329
+ double var_opt_union<T,S,A>::get_outer_tau() const {
330
+ if (outer_tau_denom_ == 0) {
331
+ return 0.0;
332
+ } else {
333
+ return outer_tau_numer_ / outer_tau_denom_;
334
+ }
335
+ }
336
+
337
+ template<typename T, typename S, typename A>
338
+ void var_opt_union<T,S,A>::merge_items(const var_opt_sketch<T,S,A>& sketch) {
339
+ if (sketch.n_ == 0) {
340
+ return;
341
+ }
342
+
343
+ n_ += sketch.n_;
344
+
345
+ // H region const_iterator
346
+ typename var_opt_sketch<T,S,A>::const_iterator h_itr(sketch, false, false);
347
+ typename var_opt_sketch<T,S,A>::const_iterator h_end(sketch, true, false);
348
+ while (h_itr != h_end) {
349
+ std::pair<const T&, const double> sample = *h_itr;
350
+ gadget_.update(sample.first, sample.second, false);
351
+ ++h_itr;
352
+ }
353
+
354
+ // Weight-correcting R region iterator (const_iterator doesn't do the correction)
355
+ typename var_opt_sketch<T,S,A>::iterator r_itr(sketch, false, true);
356
+ typename var_opt_sketch<T,S,A>::iterator r_end(sketch, true, true);
357
+ while (r_itr != r_end) {
358
+ std::pair<const T&, const double> sample = *r_itr;
359
+ gadget_.update(sample.first, sample.second, true);
360
+ ++r_itr;
361
+ }
362
+ }
363
+
364
+ template<typename T, typename S, typename A>
365
+ void var_opt_union<T,S,A>::merge_items(var_opt_sketch<T,S,A>&& sketch) {
366
+ if (sketch.n_ == 0) {
367
+ return;
368
+ }
369
+
370
+ n_ += sketch.n_;
371
+
372
+ // H region iterator
373
+ typename var_opt_sketch<T,S,A>::iterator h_itr(sketch, false, false);
374
+ typename var_opt_sketch<T,S,A>::iterator h_end(sketch, true, false);
375
+ while (h_itr != h_end) {
376
+ std::pair<T&, double> sample = *h_itr;
377
+ gadget_.update(std::move(sample.first), sample.second, false);
378
+ ++h_itr;
379
+ }
380
+
381
+ // Weight-correcting R region iterator
382
+ typename var_opt_sketch<T,S,A>::iterator r_itr(sketch, false, true);
383
+ typename var_opt_sketch<T,S,A>::iterator r_end(sketch, true, true);
384
+ while (r_itr != r_end) {
385
+ std::pair<T&, double> sample = *r_itr;
386
+ gadget_.update(std::move(sample.first), sample.second, true);
387
+ ++r_itr;
388
+ }
389
+ }
390
+
391
+ template<typename T, typename S, typename A>
392
+ void var_opt_union<T,S,A>::resolve_tau(const var_opt_sketch<T,S,A>& sketch) {
393
+ if (sketch.r_ > 0) {
394
+ const double sketch_tau = sketch.get_tau();
395
+ const double outer_tau = get_outer_tau();
396
+
397
+ if (outer_tau_denom_ == 0) {
398
+ // detect first estimation mode sketch and grab its tau
399
+ outer_tau_numer_ = sketch.total_wt_r_;
400
+ outer_tau_denom_ = sketch.r_;
401
+ } else if (sketch_tau > outer_tau) {
402
+ // switch to a bigger value of outer_tau
403
+ outer_tau_numer_ = sketch.total_wt_r_;
404
+ outer_tau_denom_ = sketch.r_;
405
+ } else if (sketch_tau == outer_tau) {
406
+ // Ok if previous equality test isn't quite perfect. Mistakes in either direction should
407
+ // be fairly benign.
408
+ // Without conceptually changing outer_tau, update number and denominator. In particular,
409
+ // add the total weight of the incoming reservoir to the running total.
410
+ outer_tau_numer_ += sketch.total_wt_r_;
411
+ outer_tau_denom_ += sketch.r_;
412
+ }
413
+
414
+ // do nothing if sketch's tau is smaller than outer_tau
415
+ }
416
+ }
417
+
418
+ template<typename T, typename S, typename A>
419
+ var_opt_sketch<T,S,A> var_opt_union<T,S,A>::get_result() const {
420
+ // If no marked items in H, gadget is already valid mathematically. We can return what is
421
+ // basically just a copy of the gadget.
422
+ if (gadget_.num_marks_in_h_ == 0) {
423
+ return simple_gadget_coercer();
424
+ } else {
425
+ // Copy of gadget. This may produce needless copying in the
426
+ // pseudo-exact case below, but should simplify the code without
427
+ // needing to make the gadget a pointer
428
+ var_opt_sketch<T,S,A> gcopy(gadget_, false, n_);
429
+
430
+ // At this point, we know that marked items are present in H. So:
431
+ // 1. Result will necessarily be in estimation mode
432
+ // 2. Marked items currently in H need to be absorbed into reservoir (R)
433
+ const bool is_pseudo_exact = detect_and_handle_subcase_of_pseudo_exact(gcopy);
434
+ if (!is_pseudo_exact) {
435
+ // continue with main logic
436
+ migrate_marked_items_by_decreasing_k(gcopy);
437
+ }
438
+ // sub-case was already detected and handled, so return the result
439
+ return gcopy;
440
+ }
441
+ }
442
+
443
+ /**
444
+ * When there are no marked items in H, the gadget is mathematically equivalent to a valid
445
+ * varopt sketch. This method simply returns a copy (without perserving marks).
446
+ *
447
+ * @return A shallow copy of the gadget as valid varopt sketch
448
+ */
449
+ template<typename T, typename S, typename A>
450
+ var_opt_sketch<T,S,A> var_opt_union<T,S,A>::simple_gadget_coercer() const {
451
+ if (gadget_.num_marks_in_h_ != 0) throw std::logic_error("simple gadget coercer only applies if no marks");
452
+ return var_opt_sketch<T,S,A>(gadget_, true, n_);
453
+ }
454
+
455
+ // this is a condition checked in detect_and_handle_subcase_of_pseudo_exact()
456
+ template<typename T, typename S, typename A>
457
+ bool var_opt_union<T,S,A>::there_exist_unmarked_h_items_lighter_than_target(double threshold) const {
458
+ for (uint32_t i = 0; i < gadget_.h_; ++i) {
459
+ if ((gadget_.weights_[i] < threshold) && !gadget_.marks_[i]) {
460
+ return true;
461
+ }
462
+ }
463
+ return false;
464
+ }
465
+
466
+ template<typename T, typename S, typename A>
467
+ bool var_opt_union<T,S,A>::detect_and_handle_subcase_of_pseudo_exact(var_opt_sketch<T,S,A>& sk) const {
468
+ // gadget is seemingly exact
469
+ const bool condition1 = gadget_.r_ == 0;
470
+
471
+ // but there are marked items in H, so only _pseudo_ exact
472
+ const bool condition2 = gadget_.num_marks_in_h_ > 0;
473
+
474
+ // if gadget is pseudo-exact and the number of marks equals outer_tau_denom, then we can deduce
475
+ // from the bookkeeping logic of resolve_tau() that all estimation mode input sketches must
476
+ // have had the same tau, so we can throw all of the marked items into a common reservoir.
477
+ const bool condition3 = gadget_.num_marks_in_h_ == outer_tau_denom_;
478
+
479
+ if (!(condition1 && condition2 && condition3)) {
480
+ return false;
481
+ } else {
482
+
483
+ // explicitly enforce rule that items in H should not be lighter than the sketch's tau
484
+ const bool anti_condition4 = there_exist_unmarked_h_items_lighter_than_target(gadget_.get_tau());
485
+ if (anti_condition4) {
486
+ return false;
487
+ } else {
488
+ // conditions 1 through 4 hold
489
+ mark_moving_gadget_coercer(sk);
490
+ return true;
491
+ }
492
+ }
493
+ }
494
+
495
+ /**
496
+ * This coercer directly transfers marked items from the gadget's H into the result's R.
497
+ * Deciding whether that is a valid thing to do is the responsibility of the caller. Currently,
498
+ * this is only used for a subcase of pseudo-exact, but later it might be used by other
499
+ * subcases as well.
500
+ *
501
+ * @param sk Copy of the gadget, modified with marked items moved to the reservoir
502
+ */
503
+ template<typename T, typename S, typename A>
504
+ void var_opt_union<T,S,A>::mark_moving_gadget_coercer(var_opt_sketch<T,S,A>& sk) const {
505
+ const uint32_t result_k = gadget_.h_ + gadget_.r_;
506
+
507
+ uint32_t result_h = 0;
508
+ uint32_t result_r = 0;
509
+ size_t next_r_pos = result_k; // = (result_k+1)-1, to fill R region from back to front
510
+
511
+ typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
512
+ double* wts = AllocDouble().allocate(result_k + 1);
513
+ T* data = A().allocate(result_k + 1);
514
+
515
+ // insert R region items, ignoring weights
516
+ // Currently (May 2017) this next block is unreachable; this coercer is used only in the
517
+ // pseudo-exact case in which case there are no items natively in R, only marked items in H
518
+ // that will be moved into R as part of the coercion process.
519
+ // Addedndum (Jan 2020): Cleanup at end of method assumes R count is 0
520
+ const size_t final_idx = gadget_.get_num_samples();
521
+ for (size_t idx = gadget_.h_ + 1; idx <= final_idx; ++idx) {
522
+ A().construct(&data[next_r_pos], T(gadget_.data_[idx]));
523
+ wts[next_r_pos] = gadget_.weights_[idx];
524
+ ++result_r;
525
+ --next_r_pos;
526
+ }
527
+
528
+ double transferred_weight = 0;
529
+
530
+ // insert H region items
531
+ for (size_t idx = 0; idx < gadget_.h_; ++idx) {
532
+ if (gadget_.marks_[idx]) {
533
+ A().construct(&data[next_r_pos], T(gadget_.data_[idx]));
534
+ wts[next_r_pos] = -1.0;
535
+ transferred_weight += gadget_.weights_[idx];
536
+ ++result_r;
537
+ --next_r_pos;
538
+ } else {
539
+ A().construct(&data[result_h], T(gadget_.data_[idx]));
540
+ wts[result_h] = gadget_.weights_[idx];
541
+ ++result_h;
542
+ }
543
+ }
544
+
545
+ if (result_h + result_r != result_k) throw std::logic_error("H + R counts must equal k");
546
+ if (fabs(transferred_weight - outer_tau_numer_) > 1e-10) {
547
+ throw std::logic_error("uexpected mismatch in transferred weight");
548
+ }
549
+
550
+ const double result_r_weight = gadget_.total_wt_r_ + transferred_weight;
551
+ const uint64_t result_n = n_;
552
+
553
+ // explicitly set weight value for the gap
554
+ wts[result_h] = -1.0;
555
+
556
+ // clean up arrays in input sketch, replace with new values
557
+ typedef typename std::allocator_traits<A>::template rebind_alloc<bool> AllocBool;
558
+ AllocBool().deallocate(sk.marks_, sk.curr_items_alloc_);
559
+ AllocDouble().deallocate(sk.weights_, sk.curr_items_alloc_);
560
+ for (size_t i = 0; i < result_k; ++i) { A().destroy(sk.data_ + i); } // assumes everything in H region, no gap
561
+ A().deallocate(sk.data_, sk.curr_items_alloc_);
562
+
563
+ sk.data_ = data;
564
+ sk.weights_ = wts;
565
+ sk.marks_ = nullptr;
566
+ sk.num_marks_in_h_ = 0;
567
+ sk.curr_items_alloc_ = result_k + 1;
568
+ sk.k_ = result_k;
569
+ sk.n_ = result_n;
570
+ sk.h_ = result_h;
571
+ sk.r_ = result_r;
572
+ sk.total_wt_r_ = result_r_weight;
573
+ }
574
+
575
+ // this is basically a continuation of get_result(), but modifying the input gadget copy
576
+ template<typename T, typename S, typename A>
577
+ void var_opt_union<T,S,A>::migrate_marked_items_by_decreasing_k(var_opt_sketch<T,S,A>& gcopy) const {
578
+ const uint32_t r_count = gcopy.r_;
579
+ const uint32_t h_count = gcopy.h_;
580
+ const uint32_t k = gcopy.k_;
581
+
582
+ // should be ensured by caller
583
+ if (gcopy.num_marks_in_h_ == 0) throw std::logic_error("unexpectedly found no marked items to migrate");
584
+ // either full (of samples), in pseudo-exact mode, or both
585
+ if ((r_count != 0) && ((h_count + r_count) != k)) throw std::logic_error("invalid gadget state");
586
+
587
+ // if non-full and pseudo-exact, change k so that gcopy is full
588
+ if ((r_count == 0) && (h_count < k)) {
589
+ gcopy.k_ = h_count; // may leve extra space allocated but that's ok
590
+ }
591
+
592
+ // Now k equals the number of samples, so reducing k will increase tau.
593
+ // Also, we know that there are at least 2 samples because 0 or 1 would have been handled
594
+ // by the earlier logic in get_result()
595
+ gcopy.decrease_k_by_1();
596
+
597
+ // gcopy is now in estimation mode, just like the final result must be (due to marked items)
598
+ if (gcopy.get_tau() == 0.0) throw std::logic_error("gadget must be in sampling mode");
599
+
600
+ // keep reducing k until all marked items have been absorbed into the reservoir
601
+ while (gcopy.num_marks_in_h_ > 0) {
602
+ // gcopy.k_ >= 2 because h_ and r_ are both at least 1, but checked in next method anyway
603
+ gcopy.decrease_k_by_1();
604
+ }
605
+
606
+ gcopy.strip_marks();
607
+ }
608
+
609
+ template<typename T, typename S, typename A>
610
+ void var_opt_union<T,S,A>::check_preamble_longs(uint8_t preamble_longs, uint8_t flags) {
611
+ bool is_empty(flags & EMPTY_FLAG_MASK);
612
+
613
+ if (is_empty) {
614
+ if (preamble_longs != PREAMBLE_LONGS_EMPTY) {
615
+ throw std::invalid_argument("Possible corruption: Preamble longs must be "
616
+ + std::to_string(PREAMBLE_LONGS_EMPTY) + " for an empty sketch. Found: "
617
+ + std::to_string(preamble_longs));
618
+ }
619
+ } else {
620
+ if (preamble_longs != PREAMBLE_LONGS_NON_EMPTY) {
621
+ throw std::invalid_argument("Possible corruption: Preamble longs must be "
622
+ + std::to_string(PREAMBLE_LONGS_NON_EMPTY)
623
+ + " for a non-empty sketch. Found: " + std::to_string(preamble_longs));
624
+ }
625
+ }
626
+ }
627
+
628
+ template<typename T, typename S, typename A>
629
+ void var_opt_union<T,S,A>::check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver) {
630
+ if (family_id == FAMILY_ID) {
631
+ if (ser_ver != SER_VER) {
632
+ throw std::invalid_argument("Possible corruption: VarOpt Union serialization version must be "
633
+ + std::to_string(SER_VER) + ". Found: " + std::to_string(ser_ver));
634
+ }
635
+ return;
636
+ }
637
+ // TODO: extend to handle reservoir sampling
638
+
639
+ throw std::invalid_argument("Possible corruption: VarOpt Union family id must be "
640
+ + std::to_string(FAMILY_ID) + ". Found: " + std::to_string(family_id));
641
+ }
642
+
643
+ } // namespace datasketches
644
+
645
+ #endif // _VAR_OPT_UNION_IMPL_HPP_