datasketches 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,239 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _VAR_OPT_UNION_HPP_
21
+ #define _VAR_OPT_UNION_HPP_
22
+
23
+ #include "var_opt_sketch.hpp"
24
+ #include "common_defs.hpp"
25
+ #include "serde.hpp"
26
+
27
+ #include <vector>
28
+
29
+ namespace datasketches {
30
+
31
+ template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>; // allocator type A rebound to allocate uint8_t
32
+
33
+ /**
34
+ * Provides a unioning operation over var_opt_sketch objects. This union allows
35
+ * the sample size k to float, possibly increasing or decreasing as warranted by
36
+ * the available data.
37
+ *
38
+ * The union currently allows serialization and deserialization, even though transporting
39
+ * union objects seems to be an anti-pattern with most sketches. We currently provide it here
40
+ * because the get_result() call may need to discard samples and decrease k in order to
41
+ * return a valid sketch, even if future calls to update() would allow k to remain larger.
42
+ *
43
+ * The (de)serialization methods may be deprecated and subsequently removed in future versions.
44
+ *
45
+ * author Kevin Lang
46
+ * author Jon Malkin
47
+ */
48
+ template <typename T, typename S = serde<T>, typename A = std::allocator<T>>
49
+ class var_opt_union {
50
+
51
+ public:
52
+ static const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2; // evaluates to 2^31 - 2
53
+
54
+ explicit var_opt_union(uint32_t max_k);
55
+ var_opt_union(const var_opt_union& other);
56
+ var_opt_union(var_opt_union&& other) noexcept;
57
+
58
+ ~var_opt_union();
59
+
60
+ var_opt_union& operator=(const var_opt_union& other);
61
+ var_opt_union& operator=(var_opt_union&& other);
62
+
63
+ /**
64
+ * Updates this union with the given sketch
65
+ * This method takes an lvalue.
66
+ * @param sk a sketch to add to the union
67
+ */
68
+ void update(const var_opt_sketch<T,S,A>& sk);
69
+
70
+ /**
71
+ * Updates this union with the given sketch
72
+ * This method takes an rvalue.
73
+ * @param sk a sketch to add to the union
74
+ */
75
+ void update(var_opt_sketch<T,S,A>&& sk);
76
+
77
+ /**
78
+ * Gets the varopt sketch resulting from the union of any input sketches.
79
+ * @return a varopt sketch
80
+ */
81
+ var_opt_sketch<T,S,A> get_result() const;
82
+
83
+ /**
84
+ * Resets the union to its default, empty state.
85
+ */
86
+ void reset();
87
+
88
+ /**
89
+ * Computes size needed to serialize the current state of the union.
90
+ * This version is for all other types and can be expensive since every item needs to be looked at.
91
+ * @return size in bytes needed to serialize this union
92
+ */
93
+ size_t get_serialized_size_bytes() const;
94
+
95
+ // This is a convenience alias for users
96
+ // The type returned by the following serialize method
97
+ typedef vector_u8<A> vector_bytes;
98
+
99
+ /**
100
+ * NOTE: This method may be deprecated in a future version.
101
+ * This method serializes the union as a vector of bytes.
102
+ * An optional header can be reserved in front of the union.
103
+ * It is a blank space of a given size.
104
+ * This header is used in Datasketches PostgreSQL extension.
105
+ * @param header_size_bytes space to reserve in front of the union
106
+ */
107
+ vector_bytes serialize(unsigned header_size_bytes = 0) const;
108
+
109
+ /**
110
+ * NOTE: This method may be deprecated in a future version.
111
+ * This method serializes the union into a given stream in a binary form
112
+ * @param os output stream
113
+ */
114
+ void serialize(std::ostream& os) const;
115
+
116
+ /**
117
+ * NOTE: This method may be deprecated in a future version.
118
+ * This method deserializes a union from a given stream.
119
+ * @param is input stream
120
+ * @return an instance of a union
121
+ */
122
+ static var_opt_union deserialize(std::istream& is);
123
+
124
+ /**
125
+ * NOTE: This method may be deprecated in a future version.
126
+ * This method deserializes a union from a given array of bytes.
127
+ * @param bytes pointer to the array of bytes
128
+ * @param size the size of the array
129
+ * @return an instance of a union
130
+ */
131
+ static var_opt_union deserialize(const void* bytes, size_t size);
132
+
133
+ /**
134
+ * Prints a summary of the union as a string.
135
+ * @return the summary as a string
136
+ */
137
+ string<A> to_string() const;
138
+
139
+
140
+ private:
141
+ typedef typename std::allocator_traits<A>::template rebind_alloc<var_opt_sketch<T,S,A>> AllocSketch;
142
+
143
+ static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
144
+ static const uint8_t PREAMBLE_LONGS_NON_EMPTY = 4;
145
+ static const uint8_t SER_VER = 2;
146
+ static const uint8_t FAMILY_ID = 14;
147
+ static const uint8_t EMPTY_FLAG_MASK = 4;
148
+
149
+ uint64_t n_; // cumulative over all input sketches
150
+
151
+ // outer tau is the largest tau of any input sketch
152
+ double outer_tau_numer_; // total weight of all input R-zones where tau = outer_tau
153
+
154
+ // total cardinality of the same R-zones, or zero if no input sketch was in estimation mode
155
+ uint64_t outer_tau_denom_;
156
+
157
+ uint32_t max_k_;
158
+
159
+ var_opt_sketch<T,S,A> gadget_;
160
+
161
+ var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
162
+ uint32_t max_k, var_opt_sketch<T,S,A>&& gadget);
163
+
164
+ /*
165
+ IMPORTANT NOTE: the "gadget" in the union object appears to be a varopt sketch,
166
+ but in fact is NOT because it doesn't satisfy the mathematical definition
167
+ of a varopt sketch of the concatenated input streams. Therefore it could be different
168
+ from a true varopt sketch with that value of K, in which case it could easily provide
169
+ worse estimation accuracy for subset-sum queries.
170
+
171
+ This should not surprise you; the approximation guarantees of varopt sketches
172
+ do not apply to things that merely resemble varopt sketches.
173
+
174
+ However, even though the gadget is not a varopt sketch, the result
175
+ of the unioning process IS a varopt sketch. It is constructed by a
176
+ somewhat complicated "resolution" process which determines the largest K
177
+ that a valid varopt sketch could have given the available information,
178
+ then constructs a varopt sketch of that size and returns it.
179
+
180
+ However, the gadget itself is not touched during the resolution process,
181
+ and additional sketches could subsequently be merged into the union,
182
+ at which point a varopt result could again be requested.
183
+ */
184
+
185
+ /*
186
+ Explanation of "marked items" in the union's gadget:
187
+
188
+ The boolean value "true" in a pair indicates that the item
189
+ came from an input sketch's R zone, so it is already the result of sampling.
190
+
191
+ Therefore it must not wind up in the H zone of the final result, because
192
+ that would imply that the item is "exact".
193
+
194
+ However, it is okay for a marked item to hang out in the gadget's H zone for a while.
195
+
196
+ And once the item has moved to the gadget's R zone, the mark is never checked again,
197
+ so no effort is made to ensure that its value is preserved or even makes sense.
198
+ */
199
+
200
+ /*
201
+ Note: if the computer could perform exact real-valued arithmetic, the union could finalize
202
+ its result by reducing k until inner_tau > outer_tau. [Due to the vagaries of floating point
203
+ arithmetic, we won't attempt to detect and specially handle the inner_tau = outer_tau special
204
+ case.]
205
+
206
+ In fact, we won't even look at tau while reducing k. Instead the logic will be based
207
+ on the more robust integer quantity num_marks_in_h_ in the gadget. It is conceivable that due
208
+ to round-off error we could end up with inner_tau slightly less than outer_tau, but that should
209
+ be fairly harmless since we will have achieved our goal of getting the marked items out of H.
210
+
211
+ Also, you might be wondering why we are bothering to maintain the numerator and denominator
212
+ separately instead of just having a single variable outer_tau. This allows us (in certain
213
+ cases) to add an input's entire R-zone weight into the result sketch, as opposed to subdividing
214
+ it then adding it back up. That would be a source of numerical inaccuracy. And even
215
+ more importantly, this design choice allows us to exactly re-construct the input sketch
216
+ when there is only one of them.
217
+ */
218
+ inline void merge_items(const var_opt_sketch<T,S,A>& sk);
219
+ inline void merge_items(var_opt_sketch<T,S,A>&& sk);
220
+ inline void resolve_tau(const var_opt_sketch<T,S,A>& sketch);
221
+
222
+ double get_outer_tau() const;
223
+
224
+ var_opt_sketch<T,S,A> simple_gadget_coercer() const;
225
+
226
+ bool there_exist_unmarked_h_items_lighter_than_target(double threshold) const;
227
+ bool detect_and_handle_subcase_of_pseudo_exact(var_opt_sketch<T,S,A>& sk) const;
228
+ void mark_moving_gadget_coercer(var_opt_sketch<T,S,A>& sk) const;
229
+ void migrate_marked_items_by_decreasing_k(var_opt_sketch<T,S,A>& sk) const;
230
+
231
+ static void check_preamble_longs(uint8_t preamble_longs, uint8_t flags);
232
+ static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
233
+ };
234
+
235
+ }
236
+
237
+ #include "var_opt_union_impl.hpp"
238
+
239
+ #endif // _VAR_OPT_UNION_HPP_
@@ -0,0 +1,645 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _VAR_OPT_UNION_IMPL_HPP_
21
+ #define _VAR_OPT_UNION_IMPL_HPP_
22
+
23
+ #include "var_opt_union.hpp"
24
+
25
+ #include <cmath>
26
+ #include <sstream>
27
+
28
+ namespace datasketches {
29
+
30
+ template<typename T, typename S, typename A>
31
+ var_opt_union<T,S,A>::var_opt_union(uint32_t max_k) :
32
+ n_(0),
33
+ outer_tau_numer_(0),
34
+ outer_tau_denom_(0.0),
35
+ max_k_(max_k),
36
+ gadget_(max_k, var_opt_sketch<T,S,A>::DEFAULT_RESIZE_FACTOR, true)
37
+ {}
38
+
39
+ template<typename T, typename S, typename A>
40
+ var_opt_union<T,S,A>::var_opt_union(const var_opt_union& other) :
41
+ n_(other.n_),
42
+ outer_tau_numer_(other.outer_tau_numer_),
43
+ outer_tau_denom_(other.outer_tau_denom_),
44
+ max_k_(other.max_k_),
45
+ gadget_(other.gadget_)
46
+ {}
47
+
48
+ template<typename T, typename S, typename A>
49
+ var_opt_union<T,S,A>::var_opt_union(var_opt_union&& other) noexcept :
50
+ n_(other.n_),
51
+ outer_tau_numer_(other.outer_tau_numer_),
52
+ outer_tau_denom_(other.outer_tau_denom_),
53
+ max_k_(other.max_k_),
54
+ gadget_(std::move(other.gadget_))
55
+ {}
56
+
57
+ template<typename T, typename S, typename A>
58
+ var_opt_union<T,S,A>::var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
59
+ uint32_t max_k, var_opt_sketch<T,S,A>&& gadget) :
60
+ n_(n),
61
+ outer_tau_numer_(outer_tau_numer),
62
+ outer_tau_denom_(outer_tau_denom),
63
+ max_k_(max_k),
64
+ gadget_(gadget)
65
+ {}
66
+
67
+ template<typename T, typename S, typename A>
68
+ var_opt_union<T,S,A>::~var_opt_union() {}
69
+
70
+ template<typename T, typename S, typename A>
71
+ var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(const var_opt_union& other) {
72
+ var_opt_union<T,S,A> union_copy(other);
73
+ std::swap(n_, union_copy.n_);
74
+ std::swap(outer_tau_numer_, union_copy.outer_tau_numer_);
75
+ std::swap(outer_tau_denom_, union_copy.outer_tau_denom_);
76
+ std::swap(max_k_, union_copy.max_k_);
77
+ std::swap(gadget_, union_copy.gadget_);
78
+ return *this;
79
+ }
80
+
81
+ template<typename T, typename S, typename A>
82
+ var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(var_opt_union&& other) {
83
+ std::swap(n_, other.n_);
84
+ std::swap(outer_tau_numer_, other.outer_tau_numer_);
85
+ std::swap(outer_tau_denom_, other.outer_tau_denom_);
86
+ std::swap(max_k_, other.max_k_);
87
+ std::swap(gadget_, other.gadget_);
88
+ return *this;
89
+ }
90
+
91
+ /*
92
+ * An empty union requires 8 bytes.
93
+ *
94
+ * <pre>
95
+ * Long || Start Byte Adr:
96
+ * Adr:
97
+ * || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
98
+ * 0 || Preamble_Longs | SerVer | FamID | Flags |---------Max Res. Size (K)---------|
99
+ * </pre>
100
+ *
101
+ * A non-empty union requires 32 bytes of preamble (the four longs shown below), followed by
102
+ * the serialized gadget sketch.
103
+ *
104
+ * The count of items seen is limited to 48 bits (~256 trillion) even though there are adjacent
105
+ * unused preamble bits. The acceptance probability for an item is a double in the range [0,1),
106
+ * limiting us to 53 bits of randomness due to details of the IEEE floating point format. To
107
+ * ensure meaningful probabilities as the items seen count approaches capacity, we intentionally
108
+ * use slightly fewer bits.
109
+ *
110
+ * Following the header are weights for the heavy items, then marks in the event this is a gadget.
111
+ * The serialized items come last.
112
+ *
113
+ * <pre>
114
+ * Long || Start Byte Adr:
115
+ * Adr:
116
+ * || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
117
+ * 0 || Preamble_Longs | SerVer | FamID | Flags |---------Max Res. Size (K)---------|
118
+ *
119
+ * || 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
120
+ * 1 ||---------------------------Items Seen Count (N)--------------------------------|
121
+ *
122
+ * || 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
123
+ * 2 ||------------------------Outer Tau Numerator (double)---------------------------|
124
+ *
125
+ * || 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
126
+ * 3 ||----------------------Outer Tau Denominator (uint64_t)-------------------------|
127
+ * </pre>
128
+ */
129
+
130
+ template<typename T, typename S, typename A>
131
+ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is) {
132
+ uint8_t preamble_longs;
133
+ is.read((char*)&preamble_longs, sizeof(preamble_longs));
134
+ uint8_t serial_version;
135
+ is.read((char*)&serial_version, sizeof(serial_version));
136
+ uint8_t family_id;
137
+ is.read((char*)&family_id, sizeof(family_id));
138
+ uint8_t flags;
139
+ is.read((char*)&flags, sizeof(flags));
140
+ uint32_t max_k;
141
+ is.read((char*)&max_k, sizeof(max_k));
142
+
143
+ check_preamble_longs(preamble_longs, flags);
144
+ check_family_and_serialization_version(family_id, serial_version);
145
+
146
+ if (max_k == 0 || max_k > MAX_K) {
147
+ throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
148
+ }
149
+
150
+ bool is_empty = flags & EMPTY_FLAG_MASK;
151
+
152
+ if (is_empty) {
153
+ if (!is.good())
154
+ throw std::runtime_error("error reading from std::istream");
155
+ else
156
+ return var_opt_union<T,S,A>(max_k);
157
+ }
158
+
159
+ uint64_t items_seen;
160
+ is.read((char*)&items_seen, sizeof(items_seen));
161
+ double outer_tau_numer;
162
+ is.read((char*)&outer_tau_numer, sizeof(outer_tau_numer));
163
+ uint64_t outer_tau_denom;
164
+ is.read((char*)&outer_tau_denom, sizeof(outer_tau_denom));
165
+
166
+ var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is);
167
+
168
+ if (!is.good())
169
+ throw std::runtime_error("error reading from std::istream");
170
+
171
+ return var_opt_union<T,S,A>(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
172
+ }
173
+
174
+ template<typename T, typename S, typename A>
175
+ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t size) {
176
+ ensure_minimum_memory(size, 8);
177
+ const char* ptr = static_cast<const char*>(bytes);
178
+ uint8_t preamble_longs;
179
+ ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
180
+ uint8_t serial_version;
181
+ ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
182
+ uint8_t family_id;
183
+ ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
184
+ uint8_t flags;
185
+ ptr += copy_from_mem(ptr, &flags, sizeof(flags));
186
+ uint32_t max_k;
187
+ ptr += copy_from_mem(ptr, &max_k, sizeof(max_k));
188
+
189
+ check_preamble_longs(preamble_longs, flags);
190
+ check_family_and_serialization_version(family_id, serial_version);
191
+
192
+ if (max_k == 0 || max_k > MAX_K) {
193
+ throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
194
+ }
195
+
196
+ bool is_empty = flags & EMPTY_FLAG_MASK;
197
+
198
+ if (is_empty) {
199
+ return var_opt_union<T,S,A>(max_k);
200
+ }
201
+
202
+ uint64_t items_seen;
203
+ ptr += copy_from_mem(ptr, &items_seen, sizeof(items_seen));
204
+ double outer_tau_numer;
205
+ ptr += copy_from_mem(ptr, &outer_tau_numer, sizeof(outer_tau_numer));
206
+ uint64_t outer_tau_denom;
207
+ ptr += copy_from_mem(ptr, &outer_tau_denom, sizeof(outer_tau_denom));
208
+
209
+ const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
210
+ var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size);
211
+
212
+ return var_opt_union<T,S,A>(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
213
+ }
214
+
215
+ template<typename T, typename S, typename A>
216
+ size_t var_opt_union<T,S,A>::get_serialized_size_bytes() const {
217
+ if (n_ == 0) {
218
+ return PREAMBLE_LONGS_EMPTY << 3;
219
+ } else {
220
+ return (PREAMBLE_LONGS_NON_EMPTY << 3) + gadget_.get_serialized_size_bytes();
221
+ }
222
+ }
223
+
224
+ template<typename T, typename S, typename A>
225
+ void var_opt_union<T,S,A>::serialize(std::ostream& os) const {
226
+ bool empty = (n_ == 0);
227
+
228
+ const uint8_t serialization_version(SER_VER);
229
+ const uint8_t family_id(FAMILY_ID);
230
+
231
+ uint8_t preamble_longs;
232
+ uint8_t flags;
233
+ if (empty) {
234
+ preamble_longs = PREAMBLE_LONGS_EMPTY;
235
+ flags = EMPTY_FLAG_MASK;
236
+ } else {
237
+ preamble_longs = PREAMBLE_LONGS_NON_EMPTY;
238
+ flags = 0;
239
+ }
240
+
241
+ os.write((char*) &preamble_longs, sizeof(uint8_t));
242
+ os.write((char*) &serialization_version, sizeof(uint8_t));
243
+ os.write((char*) &family_id, sizeof(uint8_t));
244
+ os.write((char*) &flags, sizeof(uint8_t));
245
+ os.write((char*) &max_k_, sizeof(uint32_t));
246
+
247
+ if (!empty) {
248
+ os.write((char*) &n_, sizeof(uint64_t));
249
+ os.write((char*) &outer_tau_numer_, sizeof(double));
250
+ os.write((char*) &outer_tau_denom_, sizeof(uint64_t));
251
+ gadget_.serialize(os);
252
+ }
253
+ }
254
+
255
+ template<typename T, typename S, typename A>
256
+ std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header_size_bytes) const {
257
+ const size_t size = header_size_bytes + get_serialized_size_bytes();
258
+ std::vector<uint8_t, AllocU8<A>> bytes(size);
259
+ uint8_t* ptr = bytes.data() + header_size_bytes;
260
+
261
+ const bool empty = n_ == 0;
262
+
263
+ const uint8_t serialization_version(SER_VER);
264
+ const uint8_t family_id(FAMILY_ID);
265
+
266
+ uint8_t preamble_longs;
267
+ uint8_t flags;
268
+
269
+ if (empty) {
270
+ preamble_longs = PREAMBLE_LONGS_EMPTY;
271
+ flags = EMPTY_FLAG_MASK;
272
+ } else {
273
+ preamble_longs = PREAMBLE_LONGS_NON_EMPTY;
274
+ flags = 0;
275
+ }
276
+
277
+ // first prelong
278
+ ptr += copy_to_mem(&preamble_longs, ptr, sizeof(uint8_t));
279
+ ptr += copy_to_mem(&serialization_version, ptr, sizeof(uint8_t));
280
+ ptr += copy_to_mem(&family_id, ptr, sizeof(uint8_t));
281
+ ptr += copy_to_mem(&flags, ptr, sizeof(uint8_t));
282
+ ptr += copy_to_mem(&max_k_, ptr, sizeof(uint32_t));
283
+
284
+ if (!empty) {
285
+ ptr += copy_to_mem(&n_, ptr, sizeof(uint64_t));
286
+ ptr += copy_to_mem(&outer_tau_numer_, ptr, sizeof(double));
287
+ ptr += copy_to_mem(&outer_tau_denom_, ptr, sizeof(uint64_t));
288
+
289
+ auto gadget_bytes = gadget_.serialize();
290
+ ptr += copy_to_mem(gadget_bytes.data(), ptr, gadget_bytes.size() * sizeof(uint8_t));
291
+ }
292
+
293
+ return bytes;
294
+ }
295
+
296
+ template<typename T, typename S, typename A>
297
+ void var_opt_union<T,S,A>::reset() {
298
+ n_ = 0;
299
+ outer_tau_numer_ = 0.0;
300
+ outer_tau_denom_ = 0;
301
+ gadget_.reset();
302
+ }
303
+
304
+ template<typename T, typename S, typename A>
305
+ string<A> var_opt_union<T,S,A>::to_string() const {
306
+ std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
307
+ os << "### VarOpt Union SUMMARY: " << std::endl;
308
+ os << " . n : " << n_ << std::endl;
309
+ os << " Max k : " << max_k_ << std::endl;
310
+ os << " Gadget Summary: " << std::endl;
311
+ os << gadget_.to_string();
312
+ os << "### END VarOpt Union SUMMARY: " << std::endl;
313
+ return os.str();
314
+ }
315
+
316
+ template<typename T, typename S, typename A>
317
+ void var_opt_union<T,S,A>::update(const var_opt_sketch<T,S,A>& sk) {
318
+ merge_items(sk);
319
+ resolve_tau(sk);
320
+ }
321
+
322
+ template<typename T, typename S, typename A>
323
+ void var_opt_union<T,S,A>::update(var_opt_sketch<T,S,A>&& sk) {
324
+ merge_items(std::move(sk));
325
+ resolve_tau(sk); // don't need items, so ok even if they've been moved out
326
+ }
327
+
328
+ template<typename T, typename S, typename A>
329
+ double var_opt_union<T,S,A>::get_outer_tau() const {
330
+ if (outer_tau_denom_ == 0) {
331
+ return 0.0;
332
+ } else {
333
+ return outer_tau_numer_ / outer_tau_denom_;
334
+ }
335
+ }
336
+
337
+ template<typename T, typename S, typename A>
338
+ void var_opt_union<T,S,A>::merge_items(const var_opt_sketch<T,S,A>& sketch) {
339
+ if (sketch.n_ == 0) {
340
+ return;
341
+ }
342
+
343
+ n_ += sketch.n_;
344
+
345
+ // H region const_iterator
346
+ typename var_opt_sketch<T,S,A>::const_iterator h_itr(sketch, false, false);
347
+ typename var_opt_sketch<T,S,A>::const_iterator h_end(sketch, true, false);
348
+ while (h_itr != h_end) {
349
+ std::pair<const T&, const double> sample = *h_itr;
350
+ gadget_.update(sample.first, sample.second, false);
351
+ ++h_itr;
352
+ }
353
+
354
+ // Weight-correcting R region iterator (const_iterator doesn't do the correction)
355
+ typename var_opt_sketch<T,S,A>::iterator r_itr(sketch, false, true);
356
+ typename var_opt_sketch<T,S,A>::iterator r_end(sketch, true, true);
357
+ while (r_itr != r_end) {
358
+ std::pair<const T&, const double> sample = *r_itr;
359
+ gadget_.update(sample.first, sample.second, true);
360
+ ++r_itr;
361
+ }
362
+ }
363
+
364
+ template<typename T, typename S, typename A>
365
+ void var_opt_union<T,S,A>::merge_items(var_opt_sketch<T,S,A>&& sketch) {
366
+ if (sketch.n_ == 0) {
367
+ return;
368
+ }
369
+
370
+ n_ += sketch.n_;
371
+
372
+ // H region iterator
373
+ typename var_opt_sketch<T,S,A>::iterator h_itr(sketch, false, false);
374
+ typename var_opt_sketch<T,S,A>::iterator h_end(sketch, true, false);
375
+ while (h_itr != h_end) {
376
+ std::pair<T&, double> sample = *h_itr;
377
+ gadget_.update(std::move(sample.first), sample.second, false);
378
+ ++h_itr;
379
+ }
380
+
381
+ // Weight-correcting R region iterator
382
+ typename var_opt_sketch<T,S,A>::iterator r_itr(sketch, false, true);
383
+ typename var_opt_sketch<T,S,A>::iterator r_end(sketch, true, true);
384
+ while (r_itr != r_end) {
385
+ std::pair<T&, double> sample = *r_itr;
386
+ gadget_.update(std::move(sample.first), sample.second, true);
387
+ ++r_itr;
388
+ }
389
+ }
390
+
391
+ template<typename T, typename S, typename A>
392
+ void var_opt_union<T,S,A>::resolve_tau(const var_opt_sketch<T,S,A>& sketch) {
393
+ if (sketch.r_ > 0) {
394
+ const double sketch_tau = sketch.get_tau();
395
+ const double outer_tau = get_outer_tau();
396
+
397
+ if (outer_tau_denom_ == 0) {
398
+ // detect first estimation mode sketch and grab its tau
399
+ outer_tau_numer_ = sketch.total_wt_r_;
400
+ outer_tau_denom_ = sketch.r_;
401
+ } else if (sketch_tau > outer_tau) {
402
+ // switch to a bigger value of outer_tau
403
+ outer_tau_numer_ = sketch.total_wt_r_;
404
+ outer_tau_denom_ = sketch.r_;
405
+ } else if (sketch_tau == outer_tau) {
406
+ // Ok if previous equality test isn't quite perfect. Mistakes in either direction should
407
+ // be fairly benign.
408
+ // Without conceptually changing outer_tau, update number and denominator. In particular,
409
+ // add the total weight of the incoming reservoir to the running total.
410
+ outer_tau_numer_ += sketch.total_wt_r_;
411
+ outer_tau_denom_ += sketch.r_;
412
+ }
413
+
414
+ // do nothing if sketch's tau is smaller than outer_tau
415
+ }
416
+ }
417
+
418
+ template<typename T, typename S, typename A>
419
+ var_opt_sketch<T,S,A> var_opt_union<T,S,A>::get_result() const {
420
+ // If no marked items in H, gadget is already valid mathematically. We can return what is
421
+ // basically just a copy of the gadget.
422
+ if (gadget_.num_marks_in_h_ == 0) {
423
+ return simple_gadget_coercer();
424
+ } else {
425
+ // Copy of gadget. This may produce needless copying in the
426
+ // pseudo-exact case below, but should simplify the code without
427
+ // needing to make the gadget a pointer
428
+ var_opt_sketch<T,S,A> gcopy(gadget_, false, n_);
429
+
430
+ // At this point, we know that marked items are present in H. So:
431
+ // 1. Result will necessarily be in estimation mode
432
+ // 2. Marked items currently in H need to be absorbed into reservoir (R)
433
+ const bool is_pseudo_exact = detect_and_handle_subcase_of_pseudo_exact(gcopy);
434
+ if (!is_pseudo_exact) {
435
+ // continue with main logic
436
+ migrate_marked_items_by_decreasing_k(gcopy);
437
+ }
438
+ // sub-case was already detected and handled, so return the result
439
+ return gcopy;
440
+ }
441
+ }
442
+
443
+ /**
444
+ * When there are no marked items in H, the gadget is mathematically equivalent to a valid
445
+ * varopt sketch. This method simply returns a copy (without perserving marks).
446
+ *
447
+ * @return A shallow copy of the gadget as valid varopt sketch
448
+ */
449
+ template<typename T, typename S, typename A>
450
+ var_opt_sketch<T,S,A> var_opt_union<T,S,A>::simple_gadget_coercer() const {
451
+ if (gadget_.num_marks_in_h_ != 0) throw std::logic_error("simple gadget coercer only applies if no marks");
452
+ return var_opt_sketch<T,S,A>(gadget_, true, n_);
453
+ }
454
+
455
+ // this is a condition checked in detect_and_handle_subcase_of_pseudo_exact()
456
+ template<typename T, typename S, typename A>
457
+ bool var_opt_union<T,S,A>::there_exist_unmarked_h_items_lighter_than_target(double threshold) const {
458
+ for (uint32_t i = 0; i < gadget_.h_; ++i) {
459
+ if ((gadget_.weights_[i] < threshold) && !gadget_.marks_[i]) {
460
+ return true;
461
+ }
462
+ }
463
+ return false;
464
+ }
465
+
466
+ template<typename T, typename S, typename A>
467
+ bool var_opt_union<T,S,A>::detect_and_handle_subcase_of_pseudo_exact(var_opt_sketch<T,S,A>& sk) const {
468
+ // gadget is seemingly exact
469
+ const bool condition1 = gadget_.r_ == 0;
470
+
471
+ // but there are marked items in H, so only _pseudo_ exact
472
+ const bool condition2 = gadget_.num_marks_in_h_ > 0;
473
+
474
+ // if gadget is pseudo-exact and the number of marks equals outer_tau_denom, then we can deduce
475
+ // from the bookkeeping logic of resolve_tau() that all estimation mode input sketches must
476
+ // have had the same tau, so we can throw all of the marked items into a common reservoir.
477
+ const bool condition3 = gadget_.num_marks_in_h_ == outer_tau_denom_;
478
+
479
+ if (!(condition1 && condition2 && condition3)) {
480
+ return false;
481
+ } else {
482
+
483
+ // explicitly enforce rule that items in H should not be lighter than the sketch's tau
484
+ const bool anti_condition4 = there_exist_unmarked_h_items_lighter_than_target(gadget_.get_tau());
485
+ if (anti_condition4) {
486
+ return false;
487
+ } else {
488
+ // conditions 1 through 4 hold
489
+ mark_moving_gadget_coercer(sk);
490
+ return true;
491
+ }
492
+ }
493
+ }
494
+
495
+ /**
496
+ * This coercer directly transfers marked items from the gadget's H into the result's R.
497
+ * Deciding whether that is a valid thing to do is the responsibility of the caller. Currently,
498
+ * this is only used for a subcase of pseudo-exact, but later it might be used by other
499
+ * subcases as well.
500
+ *
501
+ * @param sk Copy of the gadget, modified with marked items moved to the reservoir
502
+ */
503
+ template<typename T, typename S, typename A>
504
+ void var_opt_union<T,S,A>::mark_moving_gadget_coercer(var_opt_sketch<T,S,A>& sk) const {
505
+ const uint32_t result_k = gadget_.h_ + gadget_.r_;
506
+
507
+ uint32_t result_h = 0;
508
+ uint32_t result_r = 0;
509
+ size_t next_r_pos = result_k; // = (result_k+1)-1, to fill R region from back to front
510
+
511
+ typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
512
+ double* wts = AllocDouble().allocate(result_k + 1);
513
+ T* data = A().allocate(result_k + 1);
514
+
515
+ // insert R region items, ignoring weights
516
+ // Currently (May 2017) this next block is unreachable; this coercer is used only in the
517
+ // pseudo-exact case in which case there are no items natively in R, only marked items in H
518
+ // that will be moved into R as part of the coercion process.
519
+ // Addedndum (Jan 2020): Cleanup at end of method assumes R count is 0
520
+ const size_t final_idx = gadget_.get_num_samples();
521
+ for (size_t idx = gadget_.h_ + 1; idx <= final_idx; ++idx) {
522
+ A().construct(&data[next_r_pos], T(gadget_.data_[idx]));
523
+ wts[next_r_pos] = gadget_.weights_[idx];
524
+ ++result_r;
525
+ --next_r_pos;
526
+ }
527
+
528
+ double transferred_weight = 0;
529
+
530
+ // insert H region items
531
+ for (size_t idx = 0; idx < gadget_.h_; ++idx) {
532
+ if (gadget_.marks_[idx]) {
533
+ A().construct(&data[next_r_pos], T(gadget_.data_[idx]));
534
+ wts[next_r_pos] = -1.0;
535
+ transferred_weight += gadget_.weights_[idx];
536
+ ++result_r;
537
+ --next_r_pos;
538
+ } else {
539
+ A().construct(&data[result_h], T(gadget_.data_[idx]));
540
+ wts[result_h] = gadget_.weights_[idx];
541
+ ++result_h;
542
+ }
543
+ }
544
+
545
+ if (result_h + result_r != result_k) throw std::logic_error("H + R counts must equal k");
546
+ if (fabs(transferred_weight - outer_tau_numer_) > 1e-10) {
547
+ throw std::logic_error("uexpected mismatch in transferred weight");
548
+ }
549
+
550
+ const double result_r_weight = gadget_.total_wt_r_ + transferred_weight;
551
+ const uint64_t result_n = n_;
552
+
553
+ // explicitly set weight value for the gap
554
+ wts[result_h] = -1.0;
555
+
556
+ // clean up arrays in input sketch, replace with new values
557
+ typedef typename std::allocator_traits<A>::template rebind_alloc<bool> AllocBool;
558
+ AllocBool().deallocate(sk.marks_, sk.curr_items_alloc_);
559
+ AllocDouble().deallocate(sk.weights_, sk.curr_items_alloc_);
560
+ for (size_t i = 0; i < result_k; ++i) { A().destroy(sk.data_ + i); } // assumes everything in H region, no gap
561
+ A().deallocate(sk.data_, sk.curr_items_alloc_);
562
+
563
+ sk.data_ = data;
564
+ sk.weights_ = wts;
565
+ sk.marks_ = nullptr;
566
+ sk.num_marks_in_h_ = 0;
567
+ sk.curr_items_alloc_ = result_k + 1;
568
+ sk.k_ = result_k;
569
+ sk.n_ = result_n;
570
+ sk.h_ = result_h;
571
+ sk.r_ = result_r;
572
+ sk.total_wt_r_ = result_r_weight;
573
+ }
574
+
575
+ // this is basically a continuation of get_result(), but modifying the input gadget copy
576
+ template<typename T, typename S, typename A>
577
+ void var_opt_union<T,S,A>::migrate_marked_items_by_decreasing_k(var_opt_sketch<T,S,A>& gcopy) const {
578
+ const uint32_t r_count = gcopy.r_;
579
+ const uint32_t h_count = gcopy.h_;
580
+ const uint32_t k = gcopy.k_;
581
+
582
+ // should be ensured by caller
583
+ if (gcopy.num_marks_in_h_ == 0) throw std::logic_error("unexpectedly found no marked items to migrate");
584
+ // either full (of samples), in pseudo-exact mode, or both
585
+ if ((r_count != 0) && ((h_count + r_count) != k)) throw std::logic_error("invalid gadget state");
586
+
587
+ // if non-full and pseudo-exact, change k so that gcopy is full
588
+ if ((r_count == 0) && (h_count < k)) {
589
+ gcopy.k_ = h_count; // may leve extra space allocated but that's ok
590
+ }
591
+
592
+ // Now k equals the number of samples, so reducing k will increase tau.
593
+ // Also, we know that there are at least 2 samples because 0 or 1 would have been handled
594
+ // by the earlier logic in get_result()
595
+ gcopy.decrease_k_by_1();
596
+
597
+ // gcopy is now in estimation mode, just like the final result must be (due to marked items)
598
+ if (gcopy.get_tau() == 0.0) throw std::logic_error("gadget must be in sampling mode");
599
+
600
+ // keep reducing k until all marked items have been absorbed into the reservoir
601
+ while (gcopy.num_marks_in_h_ > 0) {
602
+ // gcopy.k_ >= 2 because h_ and r_ are both at least 1, but checked in next method anyway
603
+ gcopy.decrease_k_by_1();
604
+ }
605
+
606
+ gcopy.strip_marks();
607
+ }
608
+
609
+ template<typename T, typename S, typename A>
610
+ void var_opt_union<T,S,A>::check_preamble_longs(uint8_t preamble_longs, uint8_t flags) {
611
+ bool is_empty(flags & EMPTY_FLAG_MASK);
612
+
613
+ if (is_empty) {
614
+ if (preamble_longs != PREAMBLE_LONGS_EMPTY) {
615
+ throw std::invalid_argument("Possible corruption: Preamble longs must be "
616
+ + std::to_string(PREAMBLE_LONGS_EMPTY) + " for an empty sketch. Found: "
617
+ + std::to_string(preamble_longs));
618
+ }
619
+ } else {
620
+ if (preamble_longs != PREAMBLE_LONGS_NON_EMPTY) {
621
+ throw std::invalid_argument("Possible corruption: Preamble longs must be "
622
+ + std::to_string(PREAMBLE_LONGS_NON_EMPTY)
623
+ + " for a non-empty sketch. Found: " + std::to_string(preamble_longs));
624
+ }
625
+ }
626
+ }
627
+
628
+ template<typename T, typename S, typename A>
629
+ void var_opt_union<T,S,A>::check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver) {
630
+ if (family_id == FAMILY_ID) {
631
+ if (ser_ver != SER_VER) {
632
+ throw std::invalid_argument("Possible corruption: VarOpt Union serialization version must be "
633
+ + std::to_string(SER_VER) + ". Found: " + std::to_string(ser_ver));
634
+ }
635
+ return;
636
+ }
637
+ // TODO: extend to handle reservoir sampling
638
+
639
+ throw std::invalid_argument("Possible corruption: VarOpt Union family id must be "
640
+ + std::to_string(FAMILY_ID) + ". Found: " + std::to_string(family_id));
641
+ }
642
+
643
+ } // namespace datasketches
644
+
645
+ #endif // _VAR_OPT_UNION_IMPL_HPP_