datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,313 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch.hpp>
21
+ #include <sstream>
22
+
23
+ #include "hll.hpp"
24
+
25
+ namespace datasketches {
26
+
27
// Returns the smaller of two ints; when they are equal, b is returned.
static int min(int a, int b) {
  if (a < b) {
    return a;
  }
  return b;
}
30
+
31
// Debug-print hook for the tests in this file. The output statement is
// commented out, so a call currently does nothing.
// The string is taken by const reference so that const strings and
// temporaries can be passed in addition to named, mutable strings.
static void println(const std::string& str) {
  (void)str; // unused while the output line below stays disabled
  //std::cout << str << "\n";
}
34
+
35
+ static void basicUnion(uint64_t n1, uint64_t n2,
36
+ uint64_t lgk1, uint64_t lgk2, uint64_t lgMaxK,
37
+ target_hll_type type1, target_hll_type type2, target_hll_type resultType) {
38
+ uint64_t v = 0;
39
+ //int tot = n1 + n2;
40
+
41
+ hll_sketch h1(lgk1, type1);
42
+ hll_sketch h2(lgk2, type2);
43
+ int lgControlK = min(min(lgk1, lgk2), lgMaxK);
44
+ hll_sketch control(lgControlK, resultType);
45
+
46
+ for (uint64_t i = 0; i < n1; ++i) {
47
+ h1.update(v + i);
48
+ control.update(v + i);
49
+ }
50
+ v += n1;
51
+ for (uint64_t i = 0; i < n2; ++i) {
52
+ h2.update(v + i);
53
+ control.update(v + i);
54
+ }
55
+ v += n2;
56
+
57
+ hll_union u(lgMaxK);
58
+ u.update(std::move(h1));
59
+ u.update(h2);
60
+
61
+ hll_sketch result = u.get_result(resultType);
62
+
63
+ // force non-HIP estimates to avoid issues with in- vs out-of-order
64
+ double uEst = result.get_composite_estimate();
65
+ double uUb = result.get_upper_bound(2);
66
+ double uLb = result.get_lower_bound(2);
67
+ //double rerr = ((uEst/tot) - 1.0) * 100;
68
+
69
+ double controlEst = control.get_composite_estimate();
70
+ double controlUb = control.get_upper_bound(2);
71
+ double controlLb = control.get_lower_bound(2);
72
+
73
+ REQUIRE((controlUb - controlEst) >= 0.0);
74
+ REQUIRE((uUb - uEst) >= 0.0);
75
+ REQUIRE((controlEst - controlLb) >= 0.0);
76
+ REQUIRE((uEst - uLb) >= 0.0);
77
+
78
+ REQUIRE(controlEst == uEst);
79
+ }
80
+
81
+ /**
82
+ * The task here is to check the transition boundaries as the sketch morphs between LIST to
83
+ * SET to HLL modes. The transition points vary as a function of lgConfigK. In addition,
84
+ * this checks that the union operation is operating properly based on the order the
85
+ * sketches are presented to the union.
86
+ */
87
+ TEST_CASE("hll union: check unions", "[hll_union]") {
88
+ target_hll_type type1 = HLL_8;
89
+ target_hll_type type2 = HLL_8;
90
+ target_hll_type resultType = HLL_8;
91
+
92
+ uint64_t lgK1 = 7;
93
+ uint64_t lgK2 = 7;
94
+ uint64_t lgMaxK = 7;
95
+ uint64_t n1 = 7;
96
+ uint64_t n2 = 7;
97
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
98
+ n1 = 8;
99
+ n2 = 7;
100
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
101
+ n1 = 7;
102
+ n2 = 8;
103
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
104
+ n1 = 8;
105
+ n2 = 8;
106
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
107
+ n1 = 7;
108
+ n2 = 14;
109
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
110
+
111
+ int i = 0;
112
+ for (i = 7; i <= 13; ++i) {
113
+ lgK1 = i;
114
+ lgK2 = i;
115
+ lgMaxK = i;
116
+ {
117
+ n1 = ((1 << (i - 3)) * 3)/4; // compute the transition point
118
+ n2 = n1;
119
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
120
+ n1 += 2;
121
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
122
+ n1 -= 2;
123
+ n2 += 2;
124
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
125
+ n1 += 2;
126
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
127
+ }
128
+ lgK1 = i;
129
+ lgK2 = i + 1;
130
+ lgMaxK = i;
131
+ {
132
+ n1 = ((1 << (i - 3)) * 3)/4;
133
+ n2 = n1;
134
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
135
+ n1 += 2;
136
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
137
+ n1 -= 2;
138
+ n2 += 2;
139
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
140
+ n1 += 2;
141
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
142
+ }
143
+ lgK1 = i + 1;
144
+ lgK2 = i;
145
+ lgMaxK = i;
146
+ {
147
+ n1 = ((1 << (i - 3)) * 3)/4;
148
+ n2 = n1;
149
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
150
+ n1 += 2;
151
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
152
+ n1 -= 2;
153
+ n2 += 2;
154
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
155
+ n1 += 2;
156
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
157
+ }
158
+ lgK1 = i + 1;
159
+ lgK2 = i + 1;
160
+ lgMaxK = i;
161
+ {
162
+ n1 = ((1 << (i - 3)) * 3)/4;
163
+ n2 = n1;
164
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
165
+ n1 += 2;
166
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
167
+ n1 -= 2;
168
+ n2 += 2;
169
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
170
+ n1 += 2;
171
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
172
+ }
173
+ }
174
+ }
175
+
176
+ TEST_CASE("hll union: check composite estimate", "[hll_union]") {
177
+ hll_union u(12);
178
+ REQUIRE(u.is_empty());
179
+ REQUIRE(u.get_composite_estimate() == Approx(0.0).margin(0.03));
180
+ for (int i = 1; i <= 15; ++i) { u.update(i); }
181
+ REQUIRE(u.get_composite_estimate() == Approx(15.0).margin(15 * 0.03));
182
+ for (int i = 16; i <= 1000; ++i) { u.update(i); }
183
+ REQUIRE(u.get_composite_estimate() == Approx(1000.0).margin(1000 * 0.03));
184
+ }
185
+
186
+ TEST_CASE("hll union: check config k limits", "[hll_union]") {
187
+ REQUIRE_THROWS_AS(hll_union(HllUtil<>::MIN_LOG_K - 1), std::invalid_argument);
188
+
189
+ REQUIRE_THROWS_AS(hll_union(HllUtil<>::MAX_LOG_K + 1), std::invalid_argument);
190
+ }
191
+
192
+ static double getBound(int lgK, bool ub, bool oooFlag, int numStdDev, double est) {
193
+ double re = RelativeErrorTables<>::getRelErr(ub, oooFlag, lgK, numStdDev);
194
+ return est / (1.0 + re);
195
+ }
196
+
197
+ TEST_CASE("hll union: check ub lb", "[hll_union]") {
198
+ int lgK = 4;
199
+ int n = 1 << 20;
200
+ bool oooFlag = false;
201
+
202
+ double bound;
203
+ std::string str;
204
+
205
+ bound = (getBound(lgK, true, oooFlag, 3, n) / n) - 1;
206
+ str = "LgK=" + std::to_string(lgK) + ", UB3: " + std::to_string(bound);
207
+ println(str);
208
+ bound = (getBound(lgK, true, oooFlag, 2, n) / n) - 1;
209
+ str = "LgK=" + std::to_string(lgK) + ", UB2: " + std::to_string(bound);
210
+ println(str);
211
+ bound = (getBound(lgK, true, oooFlag, 1, n) / n) - 1;
212
+ str = "LgK=" + std::to_string(lgK) + ", UB1: " + std::to_string(bound);
213
+ println(str);
214
+ bound = (getBound(lgK, false, oooFlag, 1, n) / n) - 1;
215
+ str = "LgK=" + std::to_string(lgK) + ", LB1: " + std::to_string(bound);
216
+ println(str);
217
+ bound = (getBound(lgK, false, oooFlag, 2, n) / n) - 1;
218
+ str = "LgK=" + std::to_string(lgK) + ", LB2: " + std::to_string(bound);
219
+ println(str);
220
+ bound = (getBound(lgK, false, oooFlag, 3, n) / n) - 1;
221
+ str = "LgK=" + std::to_string(lgK) + ", LB3: " + std::to_string(bound);
222
+ println(str);
223
+ }
224
+
225
+ TEST_CASE("hll union: check conversions", "[hll_union]") {
226
+ int lgK = 4;
227
+ hll_sketch sk1(lgK, HLL_8);
228
+ hll_sketch sk2(lgK, HLL_8);
229
+ int n = 1 << 20;
230
+ for (int i = 0; i < n; ++i) {
231
+ sk1.update(i);
232
+ sk2.update(i + n);
233
+ }
234
+ hll_union hllUnion(lgK);
235
+ hllUnion.update(sk1);
236
+ hllUnion.update(sk2);
237
+
238
+ hll_sketch rsk1 = hllUnion.get_result(HLL_4);
239
+ hll_sketch rsk2 = hllUnion.get_result(HLL_6);
240
+ hll_sketch rsk3 = hllUnion.get_result(HLL_8);
241
+ double est1 = rsk1.get_estimate();
242
+ double est2 = rsk2.get_estimate();
243
+ double est3 = rsk3.get_estimate();
244
+ REQUIRE(est1 == est2);
245
+ REQUIRE(est1 == est3);
246
+ }
247
+
248
+ TEST_CASE("hll union: check input types", "[hll_union]") {
249
+ hll_union u(8);
250
+
251
+ // inserting the same value as a variety of input types
252
+ u.update((uint8_t) 102);
253
+ u.update((uint16_t) 102);
254
+ u.update((uint32_t) 102);
255
+ u.update((uint64_t) 102);
256
+ u.update((int8_t) 102);
257
+ u.update((int16_t) 102);
258
+ u.update((int32_t) 102);
259
+ u.update((int64_t) 102);
260
+ REQUIRE(u.get_estimate() == Approx(1.0).margin(0.01));
261
+
262
+ // identical binary representations
263
+ // no unsigned in Java, but need to sign-extend both as Java would do
264
+ u.update((uint8_t) 255);
265
+ u.update((int8_t) -1);
266
+
267
+ u.update((float) -2.0);
268
+ u.update((double) -2.0);
269
+
270
+ std::string str = "input string";
271
+ u.update(str);
272
+ u.update(str.c_str(), str.length());
273
+ REQUIRE(u.get_estimate() == Approx(4.0).margin(0.01));
274
+
275
+ u = hll_union(8);
276
+ u.update((float) 0.0);
277
+ u.update((float) -0.0);
278
+ u.update((double) 0.0);
279
+ u.update((double) -0.0);
280
+ REQUIRE(u.get_estimate() == Approx(1.0).margin(0.01));
281
+
282
+ u = hll_union(8);
283
+ u.update(std::nanf("3"));
284
+ u.update(std::nan("12"));
285
+ REQUIRE(1.0 == Approx(u.get_estimate()).margin(0.01));
286
+ REQUIRE(u.get_result().get_estimate() == Approx(u.get_estimate()).margin(0.01));
287
+
288
+ u = hll_union(8);
289
+ u.update(nullptr, 0);
290
+ u.update("");
291
+ REQUIRE(u.is_empty());
292
+ }
293
+
294
+ static void union_two_sketches_with_overlap(int num, uint8_t lg_k, target_hll_type type) {
295
+ hll_sketch sketch1(lg_k, type);
296
+ for (int key = 0; key < num; key++) sketch1.update(key);
297
+
298
+ const int overlap = num / 2;
299
+ hll_sketch sketch2(lg_k, type);
300
+ for (int key = overlap; key < num + overlap; key++) sketch2.update(key);
301
+
302
+ hll_union u(lg_k);
303
+ u.update(sketch1);
304
+ u.update(sketch2);
305
+ hll_sketch sketch = u.get_result(type);
306
+ REQUIRE(sketch.get_estimate() == Approx(num * 1.5).margin(num * 1.5 * 0.02));
307
+ }
308
+
309
+ TEST_CASE("hll union: check hll to hll", "[hll_union]") {
310
+ union_two_sketches_with_overlap(1000000, 11, HLL_4);
311
+ }
312
+
313
+ } /* namespace datasketches */
@@ -0,0 +1,141 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch.hpp>
21
+
22
+ #include <cstdio>
23
+
24
+ #include "hll.hpp"
25
+ #include "HllUtil.hpp"
26
+
27
+ namespace datasketches {
28
+
29
+ /*
30
+ // hex format for comparing serialized bytes
31
+ // previously used with cppunit testing to display results upon mismatch.
32
+ // catch2 testing framework provides such output, but this may be easier for debugging
33
+ // with long vectors. keeping the code for now.
34
+ static std::string toString(const datasketches::hll_sketch::vector_bytes& v) {
35
+ std::ostringstream s;
36
+ s << std::hex << std::setfill('0');
37
+ int cnt = 0;
38
+ for (uint8_t byte: v) {
39
+ if (cnt == 8) { // insert space after each 8 bytes for readability
40
+ s << ' ';
41
+ cnt = 0;
42
+ } else {
43
+ ++cnt;
44
+ }
45
+ s << std::setw(2) << static_cast<int>(byte);
46
+ }
47
+ return s.str();
48
+ }
49
+ */
50
+
51
+ // if lg_k >= 8, mode != SET!
52
+ static int get_n(int lg_k, hll_mode mode) {
53
+ if (mode == LIST) return 4;
54
+ if (mode == SET) return 1 << (lg_k - 4);
55
+ return ((lg_k < 8) && (mode == HLL)) ? (1 << lg_k) : 1 << (lg_k - 3);
56
+ }
57
+
58
+ static long v = 0;
59
+
60
+ static hll_sketch build_sketch(int lg_k, target_hll_type hll_type, hll_mode mode) {
61
+ hll_sketch sk(lg_k, hll_type);
62
+ int n = get_n(lg_k, mode);
63
+ for (int i = 0; i < n; i++) sk.update(static_cast<uint64_t>(i + v));
64
+ v += n;
65
+ return sk;
66
+ }
67
+
68
+ // merges a sketch to an empty union and gets result of the same type, checks binary equivalence
69
+ static void union_one_update(bool compact) {
70
+ for (int lg_k = 4; lg_k <= 21; lg_k++) { // all lg_k
71
+ for (int mode = 0; mode <= 2; mode++) { // List, Set, Hll
72
+ if ((lg_k < 8) && (mode == 1)) continue; // lg_k < 8 list transitions directly to HLL
73
+ for (int t = 0; t <= 2; t++) { // HLL_4, HLL_6, HLL_8
74
+ target_hll_type hll_type = (target_hll_type) t;
75
+ hll_sketch sk1 = build_sketch(lg_k, hll_type, (hll_mode) mode);
76
+ hll_union u(lg_k);
77
+ u.update(sk1);
78
+ hll_sketch sk2 = u.get_result(hll_type);
79
+ auto bytes1 = compact ? sk1.serialize_compact() : sk1.serialize_updatable();
80
+ auto bytes2 = compact ? sk2.serialize_compact() : sk2.serialize_updatable();
81
+ auto msg = "LgK=" + std::to_string(lg_k)
82
+ + ", Mode=" + std::to_string(mode)
83
+ + ", Type=" + std::to_string(hll_type)
84
+ + "\n" + sk1.to_string(true, true, true, true)
85
+ + "\n" + sk2.to_string(true, true, true, true);
86
+ if (bytes1 != bytes2) {
87
+ std::cerr << msg << std::endl;
88
+ REQUIRE(bytes1 == bytes2);
89
+ }
90
+ }
91
+ }
92
+ }
93
+ }
94
+
95
+ TEST_CASE("hll isomorphic: union one update serialize updatable", "[hll_isomorphic]") {
96
+ union_one_update(false);
97
+ }
98
+
99
+ TEST_CASE("hll isomorphic: union one update serialize compact", "[hll_isomorphic]") {
100
+ union_one_update(true);
101
+ }
102
+
103
+ // converts a sketch to a different type and converts back to the original type to check binary equivalence
104
+ static void convert_back_and_forth(bool compact) {
105
+ for (int lg_k = 4; lg_k <= 21; lg_k++) { // all lg_k
106
+ for (int mode = 0; mode <= 2; mode++) { // List, Set, Hll
107
+ if ((lg_k < 8) && (mode == 1)) continue; // lg_k < 8 list transitions directly to HLL
108
+ for (int t1 = 0; t1 <= 2; t1++) { // HLL_4, HLL_6, HLL_8
109
+ target_hll_type hll_type1 = (target_hll_type) t1;
110
+ hll_sketch sk1 = build_sketch(lg_k, hll_type1, (hll_mode) mode);
111
+ auto bytes1 = compact ? sk1.serialize_compact() : sk1.serialize_updatable();
112
+ for (int t2 = 0; t2 <= 2; t2++) { // HLL_4, HLL_6, HLL_8
113
+ if (t2 == t1) continue;
114
+ target_hll_type hll_type2 = (target_hll_type) t2;
115
+ hll_sketch sk2(hll_sketch(sk1, hll_type2), hll_type1);
116
+ auto bytes2 = compact ? sk2.serialize_compact() : sk2.serialize_updatable();
117
+ auto msg = "LgK=" + std::to_string(lg_k)
118
+ + ", Mode=" + std::to_string(mode)
119
+ + ", Type1=" + std::to_string(hll_type1)
120
+ + ", Type2=" + std::to_string(hll_type2)
121
+ + "\n" + sk1.to_string(true, true, true, true)
122
+ + "\n" + sk2.to_string(true, true, true, true);
123
+ if (bytes1 != bytes2) {
124
+ std::cerr << msg << std::endl;
125
+ REQUIRE(bytes1 == bytes2);
126
+ }
127
+ }
128
+ }
129
+ }
130
+ }
131
+ }
132
+
133
+ TEST_CASE("hll isomorphic: convert back and forth serialize updatable", "[hll_isomorphic]") {
134
+ convert_back_and_forth(false);
135
+ }
136
+
137
+ TEST_CASE("hll isomorphic: convert back and forth serialize compact", "[hll_isomorphic]") {
138
+ convert_back_and_forth(true);
139
+ }
140
+
141
+ } /* namespace datasketches */