datasketches 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,810 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef CPC_SKETCH_IMPL_HPP_
21
+ #define CPC_SKETCH_IMPL_HPP_
22
+
23
+ #include <stdexcept>
24
+ #include <cmath>
25
+ #include <cstring>
26
+ #include <sstream>
27
+
28
+ #include "cpc_confidence.hpp"
29
+ #include "kxp_byte_lookup.hpp"
30
+ #include "inv_pow2_table.hpp"
31
+ #include "cpc_util.hpp"
32
+ #include "icon_estimator.hpp"
33
+ #include "serde.hpp"
34
+ #include "count_zeros.hpp"
35
+
36
+ namespace datasketches {
37
+
38
+ template<typename A>
39
+ void cpc_init() {
40
+ get_compressor<A>(); // this initializes a global static instance of the compressor on the first use
41
+ }
42
+
43
+ template<typename A>
44
+ cpc_sketch_alloc<A>::cpc_sketch_alloc(uint8_t lg_k, uint64_t seed):
45
+ lg_k(lg_k),
46
+ seed(seed),
47
+ was_merged(false),
48
+ num_coupons(0),
49
+ surprising_value_table(2, 6 + lg_k),
50
+ sliding_window(),
51
+ window_offset(0),
52
+ first_interesting_column(0),
53
+ kxp(1 << lg_k),
54
+ hip_est_accum(0)
55
+ {
56
+ if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
57
+ throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
58
+ }
59
+ }
60
+
61
+ template<typename A>
62
+ uint8_t cpc_sketch_alloc<A>::get_lg_k() const {
63
+ return lg_k;
64
+ }
65
+
66
+ template<typename A>
67
+ bool cpc_sketch_alloc<A>::is_empty() const {
68
+ return num_coupons == 0;
69
+ }
70
+
71
+ template<typename A>
72
+ double cpc_sketch_alloc<A>::get_estimate() const {
73
+ if (!was_merged) return get_hip_estimate();
74
+ return get_icon_estimate();
75
+ }
76
+
77
+ template<typename A>
78
+ double cpc_sketch_alloc<A>::get_hip_estimate() const {
79
+ return hip_est_accum;
80
+ }
81
+
82
+ template<typename A>
83
+ double cpc_sketch_alloc<A>::get_icon_estimate() const {
84
+ return compute_icon_estimate(lg_k, num_coupons);
85
+ }
86
+
87
+ template<typename A>
88
+ double cpc_sketch_alloc<A>::get_lower_bound(unsigned kappa) const {
89
+ if (kappa < 1 || kappa > 3) {
90
+ throw std::invalid_argument("kappa must be 1, 2 or 3");
91
+ }
92
+ if (!was_merged) return get_hip_confidence_lb<A>(*this, kappa);
93
+ return get_icon_confidence_lb<A>(*this, kappa);
94
+ }
95
+
96
+ template<typename A>
97
+ double cpc_sketch_alloc<A>::get_upper_bound(unsigned kappa) const {
98
+ if (kappa < 1 || kappa > 3) {
99
+ throw std::invalid_argument("kappa must be 1, 2 or 3");
100
+ }
101
+ if (!was_merged) return get_hip_confidence_ub<A>(*this, kappa);
102
+ return get_icon_confidence_ub<A>(*this, kappa);
103
+ }
104
+
105
+ template<typename A>
106
+ void cpc_sketch_alloc<A>::update(const std::string& value) {
107
+ if (value.empty()) return;
108
+ update(value.c_str(), value.length());
109
+ }
110
+
111
+ template<typename A>
112
+ void cpc_sketch_alloc<A>::update(uint64_t value) {
113
+ update(&value, sizeof(value));
114
+ }
115
+
116
+ template<typename A>
117
+ void cpc_sketch_alloc<A>::update(int64_t value) {
118
+ update(&value, sizeof(value));
119
+ }
120
+
121
+ template<typename A>
122
+ void cpc_sketch_alloc<A>::update(uint32_t value) {
123
+ update(static_cast<int32_t>(value));
124
+ }
125
+
126
+ template<typename A>
127
+ void cpc_sketch_alloc<A>::update(int32_t value) {
128
+ update(static_cast<int64_t>(value));
129
+ }
130
+
131
+ template<typename A>
132
+ void cpc_sketch_alloc<A>::update(uint16_t value) {
133
+ update(static_cast<int16_t>(value));
134
+ }
135
+
136
+ template<typename A>
137
+ void cpc_sketch_alloc<A>::update(int16_t value) {
138
+ update(static_cast<int64_t>(value));
139
+ }
140
+
141
+ template<typename A>
142
+ void cpc_sketch_alloc<A>::update(uint8_t value) {
143
+ update(static_cast<int8_t>(value));
144
+ }
145
+
146
+ template<typename A>
147
+ void cpc_sketch_alloc<A>::update(int8_t value) {
148
+ update(static_cast<int64_t>(value));
149
+ }
150
+
151
+ template<typename A>
152
+ void cpc_sketch_alloc<A>::update(double value) {
153
+ union {
154
+ int64_t long_value;
155
+ double double_value;
156
+ } ldu;
157
+ if (value == 0.0) {
158
+ ldu.double_value = 0.0; // canonicalize -0.0 to 0.0
159
+ } else if (std::isnan(value)) {
160
+ ldu.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits()
161
+ } else {
162
+ ldu.double_value = value;
163
+ }
164
+ update(&ldu, sizeof(ldu));
165
+ }
166
+
167
+ template<typename A>
168
+ void cpc_sketch_alloc<A>::update(float value) {
169
+ update(static_cast<double>(value));
170
+ }
171
+
172
+ static inline uint32_t row_col_from_two_hashes(uint64_t hash0, uint64_t hash1, uint8_t lg_k) {
173
+ if (lg_k > 26) throw std::logic_error("lg_k > 26");
174
+ const uint64_t k = 1 << lg_k;
175
+ uint8_t col = count_leading_zeros_in_u64(hash1); // 0 <= col <= 64
176
+ if (col > 63) col = 63; // clip so that 0 <= col <= 63
177
+ const uint32_t row = hash0 & (k - 1);
178
+ uint32_t row_col = (row << 6) | col;
179
+ // To avoid the hash table's "empty" value, we change the row of the following pair.
180
+ // This case is extremely unlikely, but we might as well handle it.
181
+ if (row_col == UINT32_MAX) row_col ^= 1 << 6;
182
+ return row_col;
183
+ }
184
+
185
+ template<typename A>
186
+ void cpc_sketch_alloc<A>::update(const void* value, int size) {
187
+ HashState hashes;
188
+ MurmurHash3_x64_128(value, size, seed, hashes);
189
+ row_col_update(row_col_from_two_hashes(hashes.h1, hashes.h2, lg_k));
190
+ }
191
+
192
+ template<typename A>
193
+ void cpc_sketch_alloc<A>::row_col_update(uint32_t row_col) {
194
+ const uint8_t col = row_col & 63;
195
+ if (col < first_interesting_column) return; // important speed optimization
196
+ // window size is 0 until sketch is promoted from sparse to windowed
197
+ if (sliding_window.size() == 0) {
198
+ update_sparse(row_col);
199
+ } else {
200
+ update_windowed(row_col);
201
+ }
202
+ }
203
+
204
+ template<typename A>
205
+ void cpc_sketch_alloc<A>::update_sparse(uint32_t row_col) {
206
+ const uint64_t k = 1 << lg_k;
207
+ const uint64_t c32pre = static_cast<uint64_t>(num_coupons) << 5;
208
+ if (c32pre >= 3 * k) throw std::logic_error("c32pre >= 3 * k"); // C < 3K/32, in other words flavor == SPARSE
209
+ bool is_novel = surprising_value_table.maybe_insert(row_col);
210
+ if (is_novel) {
211
+ num_coupons++;
212
+ update_hip(row_col);
213
+ const uint64_t c32post = static_cast<uint64_t>(num_coupons) << 5;
214
+ if (c32post >= 3 * k) promote_sparse_to_windowed(); // C >= 3K/32
215
+ }
216
+ }
217
+
218
+ // the flavor is HYBRID, PINNED, or SLIDING
219
+ template<typename A>
220
+ void cpc_sketch_alloc<A>::update_windowed(uint32_t row_col) {
221
+ if (window_offset > 56) throw std::logic_error("wrong window offset");
222
+ const uint64_t k = 1 << lg_k;
223
+ const uint64_t c32pre = static_cast<uint64_t>(num_coupons) << 5;
224
+ if (c32pre < 3 * k) throw std::logic_error("c32pre < 3 * k"); // C < 3K/32, in other words flavor >= HYBRID
225
+ const uint64_t c8pre = static_cast<uint64_t>(num_coupons) << 3;
226
+ const uint64_t w8pre = static_cast<uint64_t>(window_offset) << 3;
227
+ if (c8pre >= (27 + w8pre) * k) throw std::logic_error("c8pre is wrong"); // C < (K * 27/8) + (K * window_offset)
228
+
229
+ bool is_novel = false;
230
+ const uint8_t col = row_col & 63;
231
+
232
+ if (col < window_offset) { // track the surprising 0's "before" the window
233
+ is_novel = surprising_value_table.maybe_delete(row_col); // inverted logic
234
+ } else if (col < window_offset + 8) { // track the 8 bits inside the window
235
+ if (col < window_offset) throw std::logic_error("col < window_offset");
236
+ const uint32_t row = row_col >> 6;
237
+ const uint8_t old_bits = sliding_window[row];
238
+ const uint8_t new_bits = old_bits | (1 << (col - window_offset));
239
+ if (new_bits != old_bits) {
240
+ sliding_window[row] = new_bits;
241
+ is_novel = true;
242
+ }
243
+ } else { // track the surprising 1's "after" the window
244
+ if (col < window_offset + 8) throw std::logic_error("col < window_offset + 8");
245
+ is_novel = surprising_value_table.maybe_insert(row_col); // normal logic
246
+ }
247
+
248
+ if (is_novel) {
249
+ num_coupons++;
250
+ update_hip(row_col);
251
+ const uint64_t c8post = static_cast<uint64_t>(num_coupons) << 3;
252
+ if (c8post >= (27 + w8pre) * k) {
253
+ move_window();
254
+ if (window_offset < 1 || window_offset > 56) throw std::logic_error("wrong window offset");
255
+ const uint64_t w8post = static_cast<uint64_t>(window_offset) << 3;
256
+ if (c8post >= (27 + w8post) * k) throw std::logic_error("c8pre is wrong"); // C < (K * 27/8) + (K * window_offset)
257
+ }
258
+ }
259
+ }
260
+
261
+ // Call this whenever a new coupon has been collected.
262
+ template<typename A>
263
+ void cpc_sketch_alloc<A>::update_hip(uint32_t row_col) {
264
+ const uint64_t k = 1 << lg_k;
265
+ const uint8_t col = row_col & 63;
266
+ const double one_over_p = static_cast<double>(k) / kxp;
267
+ hip_est_accum += one_over_p;
268
+ kxp -= INVERSE_POWERS_OF_2[col + 1]; // notice the "+1"
269
+ }
270
+
271
+ // In terms of flavor, this promotes SPARSE to HYBRID
272
+ template<typename A>
273
+ void cpc_sketch_alloc<A>::promote_sparse_to_windowed() {
274
+ const uint64_t k = 1 << lg_k;
275
+ const uint64_t c32 = static_cast<uint64_t>(num_coupons) << 5;
276
+ if (!(c32 == 3 * k || (lg_k == 4 && c32 > 3 * k))) throw std::logic_error("wrong c32");
277
+
278
+ sliding_window.resize(k, 0); // zero the memory (because we will be OR'ing into it)
279
+
280
+ u32_table<A> new_table(2, 6 + lg_k);
281
+
282
+ const uint32_t* old_slots = surprising_value_table.get_slots();
283
+ const size_t old_num_slots = 1 << surprising_value_table.get_lg_size();
284
+
285
+ if (window_offset != 0) throw std::logic_error("window_offset != 0");
286
+
287
+ for (size_t i = 0; i < old_num_slots; i++) {
288
+ const uint32_t row_col = old_slots[i];
289
+ if (row_col != UINT32_MAX) {
290
+ const uint8_t col = row_col & 63;
291
+ if (col < 8) {
292
+ const size_t row = row_col >> 6;
293
+ sliding_window[row] |= 1 << col;
294
+ } else {
295
+ // cannot use u32_table::must_insert(), because it doesn't provide for growth
296
+ const bool is_novel = new_table.maybe_insert(row_col);
297
+ if (!is_novel) throw std::logic_error("is_novel != true");
298
+ }
299
+ }
300
+ }
301
+
302
+ surprising_value_table = std::move(new_table);
303
+ }
304
+
305
+ template<typename A>
306
+ void cpc_sketch_alloc<A>::move_window() {
307
+ const uint8_t new_offset = window_offset + 1;
308
+ if (new_offset > 56) throw std::logic_error("new_offset > 56");
309
+ if (new_offset != determine_correct_offset(lg_k, num_coupons)) throw std::logic_error("new_offset is wrong");
310
+
311
+ if (sliding_window.size() == 0) throw std::logic_error("no sliding window");
312
+ const uint64_t k = 1 << lg_k;
313
+
314
+ // Construct the full-sized bit matrix that corresponds to the sketch
315
+ vector_u64<A> bit_matrix = build_bit_matrix();
316
+
317
+ // refresh the KXP register on every 8th window shift.
318
+ if ((new_offset & 0x7) == 0) refresh_kxp(bit_matrix.data());
319
+
320
+ surprising_value_table.clear(); // the new number of surprises will be about the same
321
+
322
+ const uint64_t mask_for_clearing_window = (static_cast<uint64_t>(0xff) << new_offset) ^ UINT64_MAX;
323
+ const uint64_t mask_for_flipping_early_zone = (static_cast<uint64_t>(1) << new_offset) - 1;
324
+ uint64_t all_surprises_ored = 0;
325
+
326
+ for (size_t i = 0; i < k; i++) {
327
+ uint64_t pattern = bit_matrix[i];
328
+ sliding_window[i] = (pattern >> new_offset) & 0xff;
329
+ pattern &= mask_for_clearing_window;
330
+ // The following line converts surprising 0's to 1's in the "early zone",
331
+ // (and vice versa, which is essential for this procedure's O(k) time cost).
332
+ pattern ^= mask_for_flipping_early_zone;
333
+ all_surprises_ored |= pattern; // a cheap way to recalculate first_interesting_column
334
+ while (pattern != 0) {
335
+ const uint8_t col = count_trailing_zeros_in_u64(pattern);
336
+ pattern = pattern ^ (static_cast<uint64_t>(1) << col); // erase the 1
337
+ const uint32_t row_col = (i << 6) | col;
338
+ const bool is_novel = surprising_value_table.maybe_insert(row_col);
339
+ if (!is_novel) throw std::logic_error("is_novel != true");
340
+ }
341
+ }
342
+
343
+ window_offset = new_offset;
344
+
345
+ first_interesting_column = count_trailing_zeros_in_u64(all_surprises_ored);
346
+ if (first_interesting_column > new_offset) first_interesting_column = new_offset; // corner case
347
+ }
348
+
349
+ // The KXP register is a double with roughly 50 bits of precision, but
350
+ // it might need roughly 90 bits to track the value with perfect accuracy.
351
+ // Therefore we recalculate KXP occasionally from the sketch's full bitmatrix
352
+ // so that it will reflect changes that were previously outside the mantissa.
353
+ template<typename A>
354
+ void cpc_sketch_alloc<A>::refresh_kxp(const uint64_t* bit_matrix) {
355
+ const uint64_t k = 1 << lg_k;
356
+
357
+ // for improved numerical accuracy, we separately sum the bytes of the U64's
358
+ double byte_sums[8]; // allocating on the stack
359
+ std::fill(byte_sums, &byte_sums[8], 0);
360
+
361
+ for (size_t i = 0; i < k; i++) {
362
+ uint64_t word = bit_matrix[i];
363
+ for (unsigned j = 0; j < 8; j++) {
364
+ const uint8_t byte = word & 0xff;
365
+ byte_sums[j] += KXP_BYTE_TABLE[byte];
366
+ word >>= 8;
367
+ }
368
+ }
369
+
370
+ double total = 0.0;
371
+ for (int j = 7; j >= 0; j--) { // the reverse order is important
372
+ const double factor = INVERSE_POWERS_OF_2[8 * j]; // pow (256.0, (-1.0 * ((double) j)));
373
+ total += factor * byte_sums[j];
374
+ }
375
+
376
+ kxp = total;
377
+ }
378
+
379
+ template<typename A>
380
+ string<A> cpc_sketch_alloc<A>::to_string() const {
381
+ std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
382
+ os << "### CPC sketch summary:" << std::endl;
383
+ os << " lg_k : " << std::to_string(lg_k) << std::endl;
384
+ os << " seed hash : " << std::hex << compute_seed_hash(seed) << std::dec << std::endl;
385
+ os << " C : " << num_coupons << std::endl;
386
+ os << " flavor : " << determine_flavor() << std::endl;
387
+ os << " merged : " << (was_merged ? "true" : "false") << std::endl;
388
+ if (!was_merged) {
389
+ os << " HIP estimate : " << hip_est_accum << std::endl;
390
+ os << " kxp : " << kxp << std::endl;
391
+ }
392
+ os << " intresting col : " << std::to_string(first_interesting_column) << std::endl;
393
+ os << " table entries : " << surprising_value_table.get_num_items() << std::endl;
394
+ os << " window : " << (sliding_window.size() == 0 ? "not " : "") << "allocated" << std::endl;
395
+ if (sliding_window.size() > 0) {
396
+ os << " window offset : " << std::to_string(window_offset) << std::endl;
397
+ }
398
+ os << "### End sketch summary" << std::endl;
399
+ return os.str();
400
+ }
401
+
402
+ template<typename A>
403
+ void cpc_sketch_alloc<A>::serialize(std::ostream& os) const {
404
+ compressed_state<A> compressed;
405
+ compressed.table_data_words = 0;
406
+ compressed.table_num_entries = 0;
407
+ compressed.window_data_words = 0;
408
+ get_compressor<A>().compress(*this, compressed);
409
+ const bool has_hip = !was_merged;
410
+ const bool has_table = compressed.table_data.size() > 0;
411
+ const bool has_window = compressed.window_data.size() > 0;
412
+ const uint8_t preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window);
413
+ os.write(reinterpret_cast<const char*>(&preamble_ints), sizeof(preamble_ints));
414
+ const uint8_t serial_version = SERIAL_VERSION;
415
+ os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
416
+ const uint8_t family = FAMILY;
417
+ os.write(reinterpret_cast<const char*>(&family), sizeof(family));
418
+ os.write(reinterpret_cast<const char*>(&lg_k), sizeof(lg_k));
419
+ os.write(reinterpret_cast<const char*>(&first_interesting_column), sizeof(first_interesting_column));
420
+ const uint8_t flags_byte(
421
+ (1 << flags::IS_COMPRESSED)
422
+ | (has_hip ? 1 << flags::HAS_HIP : 0)
423
+ | (has_table ? 1 << flags::HAS_TABLE : 0)
424
+ | (has_window ? 1 << flags::HAS_WINDOW : 0)
425
+ );
426
+ os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
427
+ const uint16_t seed_hash(compute_seed_hash(seed));
428
+ os.write((char*)&seed_hash, sizeof(seed_hash));
429
+ if (!is_empty()) {
430
+ os.write((char*)&num_coupons, sizeof(num_coupons));
431
+ if (has_table && has_window) {
432
+ // if there is no window it is the same as number of coupons
433
+ os.write((char*)&compressed.table_num_entries, sizeof(compressed.table_num_entries));
434
+ // HIP values can be in two different places in the sequence of fields
435
+ // this is the first HIP decision point
436
+ if (has_hip) write_hip(os);
437
+ }
438
+ if (has_table) {
439
+ os.write((char*)&compressed.table_data_words, sizeof(compressed.table_data_words));
440
+ }
441
+ if (has_window) {
442
+ os.write((char*)&compressed.window_data_words, sizeof(compressed.window_data_words));
443
+ }
444
+ // this is the second HIP decision point
445
+ if (has_hip && !(has_table && has_window)) write_hip(os);
446
+ if (has_window) {
447
+ os.write((char*)compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
448
+ }
449
+ if (has_table) {
450
+ os.write((char*)compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
451
+ }
452
+ }
453
+ }
454
+
455
+ template<typename A>
456
+ vector_u8<A> cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
457
+ compressed_state<A> compressed;
458
+ compressed.table_data_words = 0;
459
+ compressed.table_num_entries = 0;
460
+ compressed.window_data_words = 0;
461
+ get_compressor<A>().compress(*this, compressed);
462
+ const bool has_hip = !was_merged;
463
+ const bool has_table = compressed.table_data.size() > 0;
464
+ const bool has_window = compressed.window_data.size() > 0;
465
+ const uint8_t preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window);
466
+ const size_t size = header_size_bytes + (preamble_ints + compressed.table_data_words + compressed.window_data_words) * sizeof(uint32_t);
467
+ vector_u8<A> bytes(size);
468
+ uint8_t* ptr = bytes.data() + header_size_bytes;
469
+ ptr += copy_to_mem(&preamble_ints, ptr, sizeof(preamble_ints));
470
+ const uint8_t serial_version = SERIAL_VERSION;
471
+ ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
472
+ const uint8_t family = FAMILY;
473
+ ptr += copy_to_mem(&family, ptr, sizeof(family));
474
+ ptr += copy_to_mem(&lg_k, ptr, sizeof(lg_k));
475
+ ptr += copy_to_mem(&first_interesting_column, ptr, sizeof(first_interesting_column));
476
+ const uint8_t flags_byte(
477
+ (1 << flags::IS_COMPRESSED)
478
+ | (has_hip ? 1 << flags::HAS_HIP : 0)
479
+ | (has_table ? 1 << flags::HAS_TABLE : 0)
480
+ | (has_window ? 1 << flags::HAS_WINDOW : 0)
481
+ );
482
+ ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
483
+ const uint16_t seed_hash = compute_seed_hash(seed);
484
+ ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
485
+ if (!is_empty()) {
486
+ ptr += copy_to_mem(&num_coupons, ptr, sizeof(num_coupons));
487
+ if (has_table && has_window) {
488
+ // if there is no window it is the same as number of coupons
489
+ ptr += copy_to_mem(&compressed.table_num_entries, ptr, sizeof(compressed.table_num_entries));
490
+ // HIP values can be in two different places in the sequence of fields
491
+ // this is the first HIP decision point
492
+ if (has_hip) ptr += copy_hip_to_mem(ptr);
493
+ }
494
+ if (has_table) {
495
+ ptr += copy_to_mem(&compressed.table_data_words, ptr, sizeof(compressed.table_data_words));
496
+ }
497
+ if (has_window) {
498
+ ptr += copy_to_mem(&compressed.window_data_words, ptr, sizeof(compressed.window_data_words));
499
+ }
500
+ // this is the second HIP decision point
501
+ if (has_hip && !(has_table && has_window)) ptr += copy_hip_to_mem(ptr);
502
+ if (has_window) {
503
+ ptr += copy_to_mem(compressed.window_data.data(), ptr, compressed.window_data_words * sizeof(uint32_t));
504
+ }
505
+ if (has_table) {
506
+ ptr += copy_to_mem(compressed.table_data.data(), ptr, compressed.table_data_words * sizeof(uint32_t));
507
+ }
508
+ }
509
+ if (ptr != bytes.data() + size) throw std::logic_error("serialized size mismatch");
510
+ return bytes;
511
+ }
512
+
513
+ template<typename A>
514
+ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed) {
515
+ uint8_t preamble_ints;
516
+ is.read((char*)&preamble_ints, sizeof(preamble_ints));
517
+ uint8_t serial_version;
518
+ is.read((char*)&serial_version, sizeof(serial_version));
519
+ uint8_t family_id;
520
+ is.read((char*)&family_id, sizeof(family_id));
521
+ uint8_t lg_k;
522
+ is.read((char*)&lg_k, sizeof(lg_k));
523
+ uint8_t first_interesting_column;
524
+ is.read((char*)&first_interesting_column, sizeof(first_interesting_column));
525
+ uint8_t flags_byte;
526
+ is.read((char*)&flags_byte, sizeof(flags_byte));
527
+ uint16_t seed_hash;
528
+ is.read((char*)&seed_hash, sizeof(seed_hash));
529
+ const bool has_hip = flags_byte & (1 << flags::HAS_HIP);
530
+ const bool has_table = flags_byte & (1 << flags::HAS_TABLE);
531
+ const bool has_window = flags_byte & (1 << flags::HAS_WINDOW);
532
+ compressed_state<A> compressed;
533
+ compressed.table_data_words = 0;
534
+ compressed.table_num_entries = 0;
535
+ compressed.window_data_words = 0;
536
+ uint32_t num_coupons = 0;
537
+ double kxp = 0;
538
+ double hip_est_accum = 0;
539
+ if (has_table || has_window) {
540
+ is.read((char*)&num_coupons, sizeof(num_coupons));
541
+ if (has_table && has_window) {
542
+ is.read((char*)&compressed.table_num_entries, sizeof(compressed.table_num_entries));
543
+ if (has_hip) {
544
+ is.read((char*)&kxp, sizeof(kxp));
545
+ is.read((char*)&hip_est_accum, sizeof(hip_est_accum));
546
+ }
547
+ }
548
+ if (has_table) {
549
+ is.read((char*)&compressed.table_data_words, sizeof(compressed.table_data_words));
550
+ }
551
+ if (has_window) {
552
+ is.read((char*)&compressed.window_data_words, sizeof(compressed.window_data_words));
553
+ }
554
+ if (has_hip && !(has_table && has_window)) {
555
+ is.read((char*)&kxp, sizeof(kxp));
556
+ is.read((char*)&hip_est_accum, sizeof(hip_est_accum));
557
+ }
558
+ if (has_window) {
559
+ compressed.window_data.resize(compressed.window_data_words);
560
+ is.read((char*)compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
561
+ }
562
+ if (has_table) {
563
+ compressed.table_data.resize(compressed.table_data_words);
564
+ is.read((char*)compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
565
+ }
566
+ if (!has_window) compressed.table_num_entries = num_coupons;
567
+ }
568
+
569
+ uint8_t expected_preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window);
570
+ if (preamble_ints != expected_preamble_ints) {
571
+ throw std::invalid_argument("Possible corruption: preamble ints: expected "
572
+ + std::to_string(expected_preamble_ints) + ", got " + std::to_string(preamble_ints));
573
+ }
574
+ if (serial_version != SERIAL_VERSION) {
575
+ throw std::invalid_argument("Possible corruption: serial version: expected "
576
+ + std::to_string(SERIAL_VERSION) + ", got " + std::to_string(serial_version));
577
+ }
578
+ if (family_id != FAMILY) {
579
+ throw std::invalid_argument("Possible corruption: family: expected "
580
+ + std::to_string(FAMILY) + ", got " + std::to_string(family_id));
581
+ }
582
+ if (seed_hash != compute_seed_hash(seed)) {
583
+ throw std::invalid_argument("Incompatible seed hashes: " + std::to_string(seed_hash) + ", "
584
+ + std::to_string(compute_seed_hash(seed)));
585
+ }
586
+ uncompressed_state<A> uncompressed;
587
+ get_compressor<A>().uncompress(compressed, uncompressed, lg_k, num_coupons);
588
+ if (!is.good())
589
+ throw std::runtime_error("error reading from std::istream");
590
+ return cpc_sketch_alloc(lg_k, num_coupons, first_interesting_column, std::move(uncompressed.table),
591
+ std::move(uncompressed.window), has_hip, kxp, hip_est_accum, seed);
592
+ }
593
+
594
+ template<typename A>
595
+ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed) {
596
+ ensure_minimum_memory(size, 8);
597
+ const char* ptr = static_cast<const char*>(bytes);
598
+ const char* base = static_cast<const char*>(bytes);
599
+ uint8_t preamble_ints;
600
+ ptr += copy_from_mem(ptr, &preamble_ints, sizeof(preamble_ints));
601
+ uint8_t serial_version;
602
+ ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
603
+ uint8_t family_id;
604
+ ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
605
+ uint8_t lg_k;
606
+ ptr += copy_from_mem(ptr, &lg_k, sizeof(lg_k));
607
+ uint8_t first_interesting_column;
608
+ ptr += copy_from_mem(ptr, &first_interesting_column, sizeof(first_interesting_column));
609
+ uint8_t flags_byte;
610
+ ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
611
+ uint16_t seed_hash;
612
+ ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
613
+ const bool has_hip = flags_byte & (1 << flags::HAS_HIP);
614
+ const bool has_table = flags_byte & (1 << flags::HAS_TABLE);
615
+ const bool has_window = flags_byte & (1 << flags::HAS_WINDOW);
616
+ ensure_minimum_memory(size, preamble_ints << 2);
617
+ compressed_state<A> compressed;
618
+ compressed.table_data_words = 0;
619
+ compressed.table_num_entries = 0;
620
+ compressed.window_data_words = 0;
621
+ uint32_t num_coupons = 0;
622
+ double kxp = 0;
623
+ double hip_est_accum = 0;
624
+ if (has_table || has_window) {
625
+ check_memory_size(ptr - base + sizeof(num_coupons), size);
626
+ ptr += copy_from_mem(ptr, &num_coupons, sizeof(num_coupons));
627
+ if (has_table && has_window) {
628
+ check_memory_size(ptr - base + sizeof(compressed.table_num_entries), size);
629
+ ptr += copy_from_mem(ptr, &compressed.table_num_entries, sizeof(compressed.table_num_entries));
630
+ if (has_hip) {
631
+ check_memory_size(ptr - base + sizeof(kxp) + sizeof(hip_est_accum), size);
632
+ ptr += copy_from_mem(ptr, &kxp, sizeof(kxp));
633
+ ptr += copy_from_mem(ptr, &hip_est_accum, sizeof(hip_est_accum));
634
+ }
635
+ }
636
+ if (has_table) {
637
+ check_memory_size(ptr - base + sizeof(compressed.table_data_words), size);
638
+ ptr += copy_from_mem(ptr, &compressed.table_data_words, sizeof(compressed.table_data_words));
639
+ }
640
+ if (has_window) {
641
+ check_memory_size(ptr - base + sizeof(compressed.window_data_words), size);
642
+ ptr += copy_from_mem(ptr, &compressed.window_data_words, sizeof(compressed.window_data_words));
643
+ }
644
+ if (has_hip && !(has_table && has_window)) {
645
+ check_memory_size(ptr - base + sizeof(kxp) + sizeof(hip_est_accum), size);
646
+ ptr += copy_from_mem(ptr, &kxp, sizeof(kxp));
647
+ ptr += copy_from_mem(ptr, &hip_est_accum, sizeof(hip_est_accum));
648
+ }
649
+ if (has_window) {
650
+ compressed.window_data.resize(compressed.window_data_words);
651
+ check_memory_size(ptr - base + (compressed.window_data_words * sizeof(uint32_t)), size);
652
+ ptr += copy_from_mem(ptr, compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
653
+ }
654
+ if (has_table) {
655
+ compressed.table_data.resize(compressed.table_data_words);
656
+ check_memory_size(ptr - base + (compressed.table_data_words * sizeof(uint32_t)), size);
657
+ ptr += copy_from_mem(ptr, compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
658
+ }
659
+ if (!has_window) compressed.table_num_entries = num_coupons;
660
+ }
661
+ if (ptr != static_cast<const char*>(bytes) + size) throw std::logic_error("deserialized size mismatch");
662
+
663
+ uint8_t expected_preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window);
664
+ if (preamble_ints != expected_preamble_ints) {
665
+ throw std::invalid_argument("Possible corruption: preamble ints: expected "
666
+ + std::to_string(expected_preamble_ints) + ", got " + std::to_string(preamble_ints));
667
+ }
668
+ if (serial_version != SERIAL_VERSION) {
669
+ throw std::invalid_argument("Possible corruption: serial version: expected "
670
+ + std::to_string(SERIAL_VERSION) + ", got " + std::to_string(serial_version));
671
+ }
672
+ if (family_id != FAMILY) {
673
+ throw std::invalid_argument("Possible corruption: family: expected "
674
+ + std::to_string(FAMILY) + ", got " + std::to_string(family_id));
675
+ }
676
+ if (seed_hash != compute_seed_hash(seed)) {
677
+ throw std::invalid_argument("Incompatible seed hashes: " + std::to_string(seed_hash) + ", "
678
+ + std::to_string(compute_seed_hash(seed)));
679
+ }
680
+ uncompressed_state<A> uncompressed;
681
+ get_compressor<A>().uncompress(compressed, uncompressed, lg_k, num_coupons);
682
+ return cpc_sketch_alloc(lg_k, num_coupons, first_interesting_column, std::move(uncompressed.table),
683
+ std::move(uncompressed.window), has_hip, kxp, hip_est_accum, seed);
684
+ }
685
+
686
+ template<typename A>
687
+ uint32_t cpc_sketch_alloc<A>::get_num_coupons() const {
688
+ return num_coupons;
689
+ }
690
+
691
+ template<typename A>
692
+ bool cpc_sketch_alloc<A>::validate() const {
693
+ vector_u64<A> bit_matrix = build_bit_matrix();
694
+ const uint64_t num_bits_set = count_bits_set_in_matrix(bit_matrix.data(), 1 << lg_k);
695
+ return num_bits_set == num_coupons;
696
+ }
697
+
698
+ template<typename A>
699
+ cpc_sketch_alloc<A>::cpc_sketch_alloc(uint8_t lg_k, uint32_t num_coupons, uint8_t first_interesting_column,
700
+ u32_table<A>&& table, vector_u8<A>&& window, bool has_hip, double kxp, double hip_est_accum, uint64_t seed):
701
+ lg_k(lg_k),
702
+ seed(seed),
703
+ was_merged(!has_hip),
704
+ num_coupons(num_coupons),
705
+ surprising_value_table(std::move(table)),
706
+ sliding_window(std::move(window)),
707
+ window_offset(determine_correct_offset(lg_k, num_coupons)),
708
+ first_interesting_column(first_interesting_column),
709
+ kxp(kxp),
710
+ hip_est_accum(hip_est_accum)
711
+ {}
712
+
713
+ template<typename A>
714
+ uint8_t cpc_sketch_alloc<A>::get_preamble_ints(uint32_t num_coupons, bool has_hip, bool has_table, bool has_window) {
715
+ uint8_t preamble_ints = 2;
716
+ if (num_coupons > 0) {
717
+ preamble_ints += 1; // number of coupons
718
+ if (has_hip) {
719
+ preamble_ints += 4; // HIP
720
+ }
721
+ if (has_table) {
722
+ preamble_ints += 1; // table data length
723
+ // number of values (if there is no window it is the same as number of coupons)
724
+ if (has_window) {
725
+ preamble_ints += 1;
726
+ }
727
+ }
728
+ if (has_window) {
729
+ preamble_ints += 1; // window length
730
+ }
731
+ }
732
+ return preamble_ints;
733
+ }
734
+
735
+ template<typename A>
736
+ typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor() const {
737
+ return determine_flavor(lg_k, num_coupons);
738
+ }
739
+
740
+ template<typename A>
741
+ typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor(uint8_t lg_k, uint64_t c) {
742
+ const uint64_t k = 1 << lg_k;
743
+ const uint64_t c2 = c << 1;
744
+ const uint64_t c8 = c << 3;
745
+ const uint64_t c32 = c << 5;
746
+ if (c == 0) return EMPTY; // 0 == C < 1
747
+ if (c32 < 3 * k) return SPARSE; // 1 <= C < 3K/32
748
+ if (c2 < k) return HYBRID; // 3K/32 <= C < K/2
749
+ if (c8 < 27 * k) return PINNED; // K/2 <= C < 27K/8
750
+ else return SLIDING; // 27K/8 <= C
751
+ }
752
+
753
+ template<typename A>
754
+ uint8_t cpc_sketch_alloc<A>::determine_correct_offset(uint8_t lg_k, uint64_t c) {
755
+ const uint64_t k = 1 << lg_k;
756
+ const int64_t tmp = static_cast<int64_t>(c << 3) - static_cast<int64_t>(19 * k); // 8C - 19K
757
+ if (tmp < 0) return 0;
758
+ return tmp >> (lg_k + 3); // tmp / 8K
759
+ }
760
+
761
+ template<typename A>
762
+ vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
763
+ const size_t k = 1 << lg_k;
764
+ if (window_offset > 56) throw std::logic_error("offset > 56");
765
+
766
+ // Fill the matrix with default rows in which the "early zone" is filled with ones.
767
+ // This is essential for the routine's O(k) time cost (as opposed to O(C)).
768
+ const uint64_t default_row = (static_cast<uint64_t>(1) << window_offset) - 1;
769
+ vector_u64<A> matrix(k, default_row);
770
+
771
+ if (num_coupons == 0) return matrix;
772
+
773
+ if (sliding_window.size() > 0) { // In other words, we are in window mode, not sparse mode
774
+ for (size_t i = 0; i < k; i++) { // set the window bits, trusting the sketch's current offset
775
+ matrix[i] |= static_cast<uint64_t>(sliding_window[i]) << window_offset;
776
+ }
777
+ }
778
+
779
+ const uint32_t* slots = surprising_value_table.get_slots();
780
+ const size_t num_slots = 1 << surprising_value_table.get_lg_size();
781
+ for (size_t i = 0; i < num_slots; i++) {
782
+ const uint32_t row_col = slots[i];
783
+ if (row_col != UINT32_MAX) {
784
+ const uint8_t col = row_col & 63;
785
+ const size_t row = row_col >> 6;
786
+ // Flip the specified matrix bit from its default value.
787
+ // In the "early" zone the bit changes from 1 to 0.
788
+ // In the "late" zone the bit changes from 0 to 1.
789
+ matrix[row] ^= static_cast<uint64_t>(1) << col;
790
+ }
791
+ }
792
+ return matrix;
793
+ }
794
+
795
+ template<typename A>
796
+ void cpc_sketch_alloc<A>::write_hip(std::ostream& os) const {
797
+ os.write(reinterpret_cast<const char*>(&kxp), sizeof(kxp));
798
+ os.write(reinterpret_cast<const char*>(&hip_est_accum), sizeof(hip_est_accum));
799
+ }
800
+
801
+ template<typename A>
802
+ size_t cpc_sketch_alloc<A>::copy_hip_to_mem(void* dst) const {
803
+ memcpy(dst, &kxp, sizeof(kxp));
804
+ memcpy(static_cast<char*>(dst) + sizeof(kxp), &hip_est_accum, sizeof(hip_est_accum));
805
+ return sizeof(kxp) + sizeof(hip_est_accum);
806
+ }
807
+
808
+ } /* namespace datasketches */
809
+
810
+ #endif