datasketches 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,313 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch.hpp>
21
+ #include <sstream>
22
+
23
+ #include "hll.hpp"
24
+
25
+ namespace datasketches {
26
+
27
// Returns the smaller of the two int arguments.
// Note: basicUnion in this file calls it with uint64_t values, which are implicitly
// converted to int at the call.
+ static int min(int a, int b) {
28
+ return (a < b) ? a : b;
29
+ }
30
+
31
// Debug output hook. The only statement in the body is commented out, so as written
// this function does nothing with str; re-enable the std::cout line to see the text.
+ static void println(std::string& str) {
32
+ //std::cout << str << "\n";
33
+ }
34
+
35
// Test helper: feeds n1 values into sketch h1 (lgk1, type1) and the following n2 values
// into sketch h2 (lgk2, type2), while a control sketch of resultType receives every value
// directly. h1 and h2 are then merged through an hll_union built with lgMaxK, and the
// union's result (requested as resultType) must report exactly the same composite estimate
// as the control sketch. The upper/lower bounds obtained with argument 2 (presumably the
// number of standard deviations - confirm against hll.hpp) must bracket each estimate.
+ static void basicUnion(uint64_t n1, uint64_t n2,
36
+ uint64_t lgk1, uint64_t lgk2, uint64_t lgMaxK,
37
+ target_hll_type type1, target_hll_type type2, target_hll_type resultType) {
38
+ uint64_t v = 0;
39
+ //int tot = n1 + n2;
40
+
41
+ hll_sketch h1(lgk1, type1);
42
+ hll_sketch h2(lgk2, type2);
43
// the control sketch uses the smallest of the three lgK values; the uint64_t inputs are
// narrowed to int when passed to min()
+ int lgControlK = min(min(lgk1, lgk2), lgMaxK);
44
+ hll_sketch control(lgControlK, resultType);
45
+
46
// values 0 .. n1-1 go to h1 and to the control
+ for (uint64_t i = 0; i < n1; ++i) {
47
+ h1.update(v + i);
48
+ control.update(v + i);
49
+ }
50
// shift the base so that h2 receives n1 .. n1+n2-1, disjoint from h1's values
+ v += n1;
51
+ for (uint64_t i = 0; i < n2; ++i) {
52
+ h2.update(v + i);
53
+ control.update(v + i);
54
+ }
55
// v is not read again after this increment
+ v += n2;
56
+
57
+ hll_union u(lgMaxK);
58
// h1 is handed to the union as an rvalue, h2 as an lvalue
+ u.update(std::move(h1));
59
+ u.update(h2);
60
+
61
+ hll_sketch result = u.get_result(resultType);
62
+
63
+ // force non-HIP estimates to avoid issues with in- vs out-of-order
64
+ double uEst = result.get_composite_estimate();
65
+ double uUb = result.get_upper_bound(2);
66
+ double uLb = result.get_lower_bound(2);
67
+ //double rerr = ((uEst/tot) - 1.0) * 100;
68
+
69
+ double controlEst = control.get_composite_estimate();
70
+ double controlUb = control.get_upper_bound(2);
71
+ double controlLb = control.get_lower_bound(2);
72
+
73
// each upper bound must not be below its estimate, each lower bound not above it
+ REQUIRE((controlUb - controlEst) >= 0.0);
74
+ REQUIRE((uUb - uEst) >= 0.0);
75
+ REQUIRE((controlEst - controlLb) >= 0.0);
76
+ REQUIRE((uEst - uLb) >= 0.0);
77
+
78
// exact floating-point equality is required between the control and the union result
+ REQUIRE(controlEst == uEst);
79
+ }
80
+
81
+ /**
82
+ * The task here is to check the transition boundaries as the sketch morphs from LIST to
83
+ * SET to HLL modes. The transition points vary as a function of lgConfigK. In addition,
84
+ * this checks that the union operation is operating properly based on the order the
85
+ * sketches are presented to the union.
86
+ */
87
+ TEST_CASE("hll union: check unions", "[hll_union]") {
88
// every call below uses HLL_8 for both inputs and for the result
+ target_hll_type type1 = HLL_8;
89
+ target_hll_type type2 = HLL_8;
90
+ target_hll_type resultType = HLL_8;
91
+
92
// part 1: all lgK values fixed at 7, with hand-picked (n1, n2) pairs:
// (7,7), (8,7), (7,8), (8,8), (7,14)
+ uint64_t lgK1 = 7;
93
+ uint64_t lgK2 = 7;
94
+ uint64_t lgMaxK = 7;
95
+ uint64_t n1 = 7;
96
+ uint64_t n2 = 7;
97
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
98
+ n1 = 8;
99
+ n2 = 7;
100
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
101
+ n1 = 7;
102
+ n2 = 8;
103
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
104
+ n1 = 8;
105
+ n2 = 8;
106
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
107
+ n1 = 7;
108
+ n2 = 14;
109
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
110
+
111
// part 2: for i = 7..13, with lgMaxK = i, try the four (lgK1, lgK2) pairings drawn from
// {i, i+1}. With t = ((1 << (i - 3)) * 3) / 4, each pairing runs (n1, n2) =
// (t, t), (t+2, t), (t, t+2), (t+2, t+2).
+ int i = 0;
112
+ for (i = 7; i <= 13; ++i) {
113
// pairing 1: lgK1 = i, lgK2 = i
+ lgK1 = i;
114
+ lgK2 = i;
115
+ lgMaxK = i;
116
+ {
117
+ n1 = ((1 << (i - 3)) * 3)/4; // compute the transition point
118
+ n2 = n1;
119
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
120
+ n1 += 2;
121
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
122
+ n1 -= 2;
123
+ n2 += 2;
124
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
125
+ n1 += 2;
126
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
127
+ }
128
// pairing 2: lgK1 = i, lgK2 = i + 1
+ lgK1 = i;
129
+ lgK2 = i + 1;
130
+ lgMaxK = i;
131
+ {
132
+ n1 = ((1 << (i - 3)) * 3)/4;
133
+ n2 = n1;
134
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
135
+ n1 += 2;
136
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
137
+ n1 -= 2;
138
+ n2 += 2;
139
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
140
+ n1 += 2;
141
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
142
+ }
143
// pairing 3: lgK1 = i + 1, lgK2 = i
+ lgK1 = i + 1;
144
+ lgK2 = i;
145
+ lgMaxK = i;
146
+ {
147
+ n1 = ((1 << (i - 3)) * 3)/4;
148
+ n2 = n1;
149
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
150
+ n1 += 2;
151
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
152
+ n1 -= 2;
153
+ n2 += 2;
154
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
155
+ n1 += 2;
156
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
157
+ }
158
// pairing 4: lgK1 = i + 1, lgK2 = i + 1 (both larger than lgMaxK)
+ lgK1 = i + 1;
159
+ lgK2 = i + 1;
160
+ lgMaxK = i;
161
+ {
162
+ n1 = ((1 << (i - 3)) * 3)/4;
163
+ n2 = n1;
164
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
165
+ n1 += 2;
166
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
167
+ n1 -= 2;
168
+ n2 += 2;
169
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
170
+ n1 += 2;
171
+ basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
172
+ }
173
+ }
174
+ }
175
+
176
// Checks get_composite_estimate() on a union constructed with 12 that is updated directly
// with integers: empty, after 15 values, and after 1000 values.
+ TEST_CASE("hll union: check composite estimate", "[hll_union]") {
177
+ hll_union u(12);
178
// a fresh union is empty and its estimate is within 0.03 of zero
+ REQUIRE(u.is_empty());
179
+ REQUIRE(u.get_composite_estimate() == Approx(0.0).margin(0.03));
180
// after the values 1..15 the estimate must be within 3% of 15
+ for (int i = 1; i <= 15; ++i) { u.update(i); }
181
+ REQUIRE(u.get_composite_estimate() == Approx(15.0).margin(15 * 0.03));
182
// adding 16..1000 brings the total to 1000 values; estimate must be within 3% of 1000
+ for (int i = 16; i <= 1000; ++i) { u.update(i); }
183
+ REQUIRE(u.get_composite_estimate() == Approx(1000.0).margin(1000 * 0.03));
184
+ }
185
+
186
// Constructing an hll_union with a value one below HllUtil<>::MIN_LOG_K or one above
// HllUtil<>::MAX_LOG_K must throw std::invalid_argument.
+ TEST_CASE("hll union: check config k limits", "[hll_union]") {
187
+ REQUIRE_THROWS_AS(hll_union(HllUtil<>::MIN_LOG_K - 1), std::invalid_argument);
188
+
189
+ REQUIRE_THROWS_AS(hll_union(HllUtil<>::MAX_LOG_K + 1), std::invalid_argument);
190
+ }
191
+
192
// Looks up a relative error via RelativeErrorTables<>::getRelErr(ub, oooFlag, lgK, numStdDev)
// and returns est / (1 + relative error).
// NOTE(review): judging by the names, ub selects upper vs lower bound and oooFlag the
// out-of-order case - confirm against RelativeErrorTables.hpp.
+ static double getBound(int lgK, bool ub, bool oooFlag, int numStdDev, double est) {
193
+ double re = RelativeErrorTables<>::getRelErr(ub, oooFlag, lgK, numStdDev);
194
+ return est / (1.0 + re);
195
+ }
196
+
197
// For lgK = 4 and n = 2^20, computes (getBound(...) / n) - 1 with ub = true for
// numStdDev 3, 2, 1 and then with ub = false for numStdDev 1, 2, 3, formats each value
// into a labelled string and passes it to println().
// This test contains no REQUIRE assertions; it only exercises the calls.
+ TEST_CASE("hll union: check ub lb", "[hll_union]") {
198
+ int lgK = 4;
199
+ int n = 1 << 20;
200
+ bool oooFlag = false;
201
+
202
+ double bound;
203
+ std::string str;
204
+
205
// ub = true: labels UB3, UB2, UB1
+ bound = (getBound(lgK, true, oooFlag, 3, n) / n) - 1;
206
+ str = "LgK=" + std::to_string(lgK) + ", UB3: " + std::to_string(bound);
207
+ println(str);
208
+ bound = (getBound(lgK, true, oooFlag, 2, n) / n) - 1;
209
+ str = "LgK=" + std::to_string(lgK) + ", UB2: " + std::to_string(bound);
210
+ println(str);
211
+ bound = (getBound(lgK, true, oooFlag, 1, n) / n) - 1;
212
+ str = "LgK=" + std::to_string(lgK) + ", UB1: " + std::to_string(bound);
213
+ println(str);
214
// ub = false: labels LB1, LB2, LB3
+ bound = (getBound(lgK, false, oooFlag, 1, n) / n) - 1;
215
+ str = "LgK=" + std::to_string(lgK) + ", LB1: " + std::to_string(bound);
216
+ println(str);
217
+ bound = (getBound(lgK, false, oooFlag, 2, n) / n) - 1;
218
+ str = "LgK=" + std::to_string(lgK) + ", LB2: " + std::to_string(bound);
219
+ println(str);
220
+ bound = (getBound(lgK, false, oooFlag, 3, n) / n) - 1;
221
+ str = "LgK=" + std::to_string(lgK) + ", LB3: " + std::to_string(bound);
222
+ println(str);
223
+ }
224
+
225
// Fills two HLL_8 sketches (lgK = 4) with disjoint ranges of 2^20 values each, merges both
// into one union, and checks that the results requested as HLL_4, HLL_6 and HLL_8 all
// report exactly the same get_estimate().
+ TEST_CASE("hll union: check conversions", "[hll_union]") {
226
+ int lgK = 4;
227
+ hll_sketch sk1(lgK, HLL_8);
228
+ hll_sketch sk2(lgK, HLL_8);
229
+ int n = 1 << 20;
230
// sk1 receives 0 .. n-1, sk2 receives n .. 2n-1
+ for (int i = 0; i < n; ++i) {
231
+ sk1.update(i);
232
+ sk2.update(i + n);
233
+ }
234
+ hll_union hllUnion(lgK);
235
+ hllUnion.update(sk1);
236
+ hllUnion.update(sk2);
237
+
238
// the same union is asked for its result in each of the three target types
+ hll_sketch rsk1 = hllUnion.get_result(HLL_4);
239
+ hll_sketch rsk2 = hllUnion.get_result(HLL_6);
240
+ hll_sketch rsk3 = hllUnion.get_result(HLL_8);
241
+ double est1 = rsk1.get_estimate();
242
+ double est2 = rsk2.get_estimate();
243
+ double est3 = rsk3.get_estimate();
244
// exact floating-point equality is required across the three results
+ REQUIRE(est1 == est2);
245
+ REQUIRE(est1 == est3);
246
+ }
247
+
248
// Exercises hll_union::update() with many argument types and checks, through the
// estimate, which inputs the union is expected to treat as the same item.
+ TEST_CASE("hll union: check input types", "[hll_union]") {
249
+ hll_union u(8);
250
+
251
+ // inserting the same value as a variety of input types
252
+ u.update((uint8_t) 102);
253
+ u.update((uint16_t) 102);
254
+ u.update((uint32_t) 102);
255
+ u.update((uint64_t) 102);
256
+ u.update((int8_t) 102);
257
+ u.update((int16_t) 102);
258
+ u.update((int32_t) 102);
259
+ u.update((int64_t) 102);
260
// all eight integer updates are expected to count as one distinct item
+ REQUIRE(u.get_estimate() == Approx(1.0).margin(0.01));
261
+
262
+ // identical binary representations
263
+ // no unsigned in Java, but need to sign-extend both as Java would do
264
+ u.update((uint8_t) 255);
265
+ u.update((int8_t) -1);
266
+
267
+ u.update((float) -2.0);
268
+ u.update((double) -2.0);
269
+
270
+ std::string str = "input string";
271
+ u.update(str);
272
+ u.update(str.c_str(), str.length());
273
// expected total is 4: each of the three pairs above is expected to add exactly one
// new distinct item on top of the 102 already present
+ REQUIRE(u.get_estimate() == Approx(4.0).margin(0.01));
274
+
275
// fresh union: +0.0 and -0.0, as float and as double, are expected to count as one item
+ u = hll_union(8);
276
+ u.update((float) 0.0);
277
+ u.update((float) -0.0);
278
+ u.update((double) 0.0);
279
+ u.update((double) -0.0);
280
+ REQUIRE(u.get_estimate() == Approx(1.0).margin(0.01));
281
+
282
// fresh union: a float NaN and a double NaN built from different tag strings are
// expected to count as one item; the result sketch must agree with the union's estimate
+ u = hll_union(8);
283
+ u.update(std::nanf("3"));
284
+ u.update(std::nan("12"));
285
+ REQUIRE(1.0 == Approx(u.get_estimate()).margin(0.01));
286
+ REQUIRE(u.get_result().get_estimate() == Approx(u.get_estimate()).margin(0.01));
287
+
288
// fresh union: updating with (nullptr, 0) and with an empty string literal is expected
// to leave the union empty
+ u = hll_union(8);
289
+ u.update(nullptr, 0);
290
+ u.update("");
291
+ REQUIRE(u.is_empty());
292
+ }
293
+
294
// Test helper: builds two sketches of the given lg_k and type that share part of their
// keys, merges them through an hll_union(lg_k), and requires the estimate of the result
// (requested as the same type) to be within 2% of num * 1.5.
// sketch1 receives keys 0 .. num-1; sketch2 receives keys num/2 .. num + num/2 - 1, so the
// union holds num + num/2 distinct keys (equal to num * 1.5 when num is even).
+ static void union_two_sketches_with_overlap(int num, uint8_t lg_k, target_hll_type type) {
295
+ hll_sketch sketch1(lg_k, type);
296
+ for (int key = 0; key < num; key++) sketch1.update(key);
297
+
298
+ const int overlap = num / 2;
299
+ hll_sketch sketch2(lg_k, type);
300
+ for (int key = overlap; key < num + overlap; key++) sketch2.update(key);
301
+
302
+ hll_union u(lg_k);
303
+ u.update(sketch1);
304
+ u.update(sketch2);
305
+ hll_sketch sketch = u.get_result(type);
306
+ REQUIRE(sketch.get_estimate() == Approx(num * 1.5).margin(num * 1.5 * 0.02));
307
+ }
308
+
309
// Runs the overlapping-sketch union check with 1,000,000 keys per sketch, lg_k = 11, HLL_4.
+ TEST_CASE("hll union: check hll to hll", "[hll_union]") {
310
+ union_two_sketches_with_overlap(1000000, 11, HLL_4);
311
+ }
312
+
313
+ } /* namespace datasketches */
@@ -0,0 +1,141 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch.hpp>
21
+
22
+ #include <cstdio>
23
+
24
+ #include "hll.hpp"
25
+ #include "HllUtil.hpp"
26
+
27
+ namespace datasketches {
28
+
29
+ /*
30
+ // hex format for comparing serialized bytes
31
+ // previously used with cppunit testing to display results upon mismatch.
32
+ // catch2 testing framework provides such output, but this may be easier for debugging
33
+ // with long vectors. keeping the code for now.
34
+ static std::string toString(const datasketches::hll_sketch::vector_bytes& v) {
35
+ std::ostringstream s;
36
+ s << std::hex << std::setfill('0');
37
+ int cnt = 0;
38
+ for (uint8_t byte: v) {
39
+ if (cnt == 8) { // insert space after each 8 bytes for readability
40
+ s << ' ';
41
+ cnt = 0;
42
+ } else {
43
+ ++cnt;
44
+ }
45
+ s << std::setw(2) << static_cast<int>(byte);
46
+ }
47
+ return s.str();
48
+ }
49
+ */
50
+
51
+ // if lg_k >= 8, mode != SET!
// NOTE(review): the loops in this file that call build_sketch skip SET when lg_k < 8,
// which looks like the opposite of the condition stated above - confirm which is intended.
// Returns how many items to feed into a sketch for the requested mode:
//   LIST -> 4
//   SET  -> 2^(lg_k - 4)
//   HLL  -> 2^lg_k when lg_k < 8, otherwise 2^(lg_k - 3)
// Presumably these counts are chosen so the sketch ends up in that mode - TODO confirm.
+ static int get_n(int lg_k, hll_mode mode) {
53
+ if (mode == LIST) return 4;
54
+ if (mode == SET) return 1 << (lg_k - 4);
55
+ return ((lg_k < 8) && (mode == HLL)) ? (1 << lg_k) : 1 << (lg_k - 3);
56
+ }
57
+
58
// File-scope running base value shared by all build_sketch calls; it is advanced after
// each call so that successive sketches receive disjoint value ranges.
+ static long v = 0;
59
+
60
// Creates a sketch with the given lg_k and target type and updates it with the
// get_n(lg_k, mode) consecutive values v .. v+n-1 (cast to uint64_t), then advances v by n.
+ static hll_sketch build_sketch(int lg_k, target_hll_type hll_type, hll_mode mode) {
61
+ hll_sketch sk(lg_k, hll_type);
62
+ int n = get_n(lg_k, mode);
63
+ for (int i = 0; i < n; i++) sk.update(static_cast<uint64_t>(i + v));
64
+ v += n;
65
+ return sk;
66
+ }
67
+
68
+ // merges a sketch to an empty union and gets result of the same type, checks binary equivalence
// compact selects which serialization is compared: serialize_compact() when true,
// serialize_updatable() when false.
+ static void union_one_update(bool compact) {
70
+ for (int lg_k = 4; lg_k <= 21; lg_k++) { // all lg_k
71
+ for (int mode = 0; mode <= 2; mode++) { // List, Set, Hll
72
+ if ((lg_k < 8) && (mode == 1)) continue; // lg_k < 8 list transitions directly to HLL
73
+ for (int t = 0; t <= 2; t++) { // HLL_4, HLL_6, HLL_8
74
// the loop integers are cast to the enum types; the inline comments above give the
// assumed enumerator order
+ target_hll_type hll_type = (target_hll_type) t;
75
+ hll_sketch sk1 = build_sketch(lg_k, hll_type, (hll_mode) mode);
76
+ hll_union u(lg_k);
77
+ u.update(sk1);
78
+ hll_sketch sk2 = u.get_result(hll_type);
79
+ auto bytes1 = compact ? sk1.serialize_compact() : sk1.serialize_updatable();
80
+ auto bytes2 = compact ? sk2.serialize_compact() : sk2.serialize_updatable();
81
// the diagnostic text is built on every iteration but only written out on a mismatch
+ auto msg = "LgK=" + std::to_string(lg_k)
82
+ + ", Mode=" + std::to_string(mode)
83
+ + ", Type=" + std::to_string(hll_type)
84
+ + "\n" + sk1.to_string(true, true, true, true)
85
+ + "\n" + sk2.to_string(true, true, true, true);
86
// REQUIRE is only reached when the bytes differ, i.e. it fails the test right after
// the diagnostic has been written to std::cerr
+ if (bytes1 != bytes2) {
87
+ std::cerr << msg << std::endl;
88
+ REQUIRE(bytes1 == bytes2);
89
+ }
90
+ }
91
+ }
92
+ }
93
+ }
94
+
95
// Runs union_one_update comparing the serialize_updatable() bytes.
+ TEST_CASE("hll isomorphic: union one update serialize updatable", "[hll_isomorphic]") {
96
+ union_one_update(false);
97
+ }
98
+
99
// Runs union_one_update comparing the serialize_compact() bytes.
+ TEST_CASE("hll isomorphic: union one update serialize compact", "[hll_isomorphic]") {
100
+ union_one_update(true);
101
+ }
102
+
103
+ // converts a sketch to a different type and converts back to the original type to check binary equivalence
// compact selects which serialization is compared: serialize_compact() when true,
// serialize_updatable() when false.
+ static void convert_back_and_forth(bool compact) {
105
+ for (int lg_k = 4; lg_k <= 21; lg_k++) { // all lg_k
106
+ for (int mode = 0; mode <= 2; mode++) { // List, Set, Hll
107
+ if ((lg_k < 8) && (mode == 1)) continue; // lg_k < 8 list transitions directly to HLL
108
+ for (int t1 = 0; t1 <= 2; t1++) { // HLL_4, HLL_6, HLL_8
109
+ target_hll_type hll_type1 = (target_hll_type) t1;
110
+ hll_sketch sk1 = build_sketch(lg_k, hll_type1, (hll_mode) mode);
111
// sk1 is serialized once and compared against each round-tripped copy below
+ auto bytes1 = compact ? sk1.serialize_compact() : sk1.serialize_updatable();
112
+ for (int t2 = 0; t2 <= 2; t2++) { // HLL_4, HLL_6, HLL_8
113
// only the other two types are used as the intermediate type
+ if (t2 == t1) continue;
114
+ target_hll_type hll_type2 = (target_hll_type) t2;
115
// inner temporary: sk1 converted to hll_type2; outer sketch: converted back to hll_type1
+ hll_sketch sk2(hll_sketch(sk1, hll_type2), hll_type1);
116
+ auto bytes2 = compact ? sk2.serialize_compact() : sk2.serialize_updatable();
117
// the diagnostic text is built on every iteration but only written out on a mismatch
+ auto msg = "LgK=" + std::to_string(lg_k)
118
+ + ", Mode=" + std::to_string(mode)
119
+ + ", Type1=" + std::to_string(hll_type1)
120
+ + ", Type2=" + std::to_string(hll_type2)
121
+ + "\n" + sk1.to_string(true, true, true, true)
122
+ + "\n" + sk2.to_string(true, true, true, true);
123
// REQUIRE is only reached when the bytes differ, i.e. it fails the test right after
// the diagnostic has been written to std::cerr
+ if (bytes1 != bytes2) {
124
+ std::cerr << msg << std::endl;
125
+ REQUIRE(bytes1 == bytes2);
126
+ }
127
+ }
128
+ }
129
+ }
130
+ }
131
+ }
132
+
133
// Runs convert_back_and_forth comparing the serialize_updatable() bytes.
+ TEST_CASE("hll isomorphic: convert back and forth serialize updatable", "[hll_isomorphic]") {
134
+ convert_back_and_forth(false);
135
+ }
136
+
137
// Runs convert_back_and_forth comparing the serialize_compact() bytes.
+ TEST_CASE("hll isomorphic: convert back and forth serialize compact", "[hll_isomorphic]") {
138
+ convert_back_and_forth(true);
139
+ }
140
+
141
+ } /* namespace datasketches */