datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,122 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef THETA_UNION_HPP_
21
+ #define THETA_UNION_HPP_
22
+
23
+ #include <memory>
24
+ #include <functional>
25
+ #include <climits>
26
+
27
+ #include "theta_sketch.hpp"
28
+
29
+ namespace datasketches {
30
+
31
+ /*
32
+ * author Alexander Saydakov
33
+ * author Lee Rhodes
34
+ * author Kevin Lang
35
+ */
36
+
37
+ template<typename A>
38
+ class theta_union_alloc {
39
+ public:
40
+ class builder;
41
+
42
+ // No constructor here. Use builder instead.
43
+
44
+ /**
45
+ * This method is to update the union with a given sketch
46
+ * @param sketch to update the union with
47
+ */
48
+ void update(const theta_sketch_alloc<A>& sketch);
49
+
50
+ /**
51
+ * This method produces a copy of the current state of the union as a compact sketch.
52
+ * @param ordered optional flag to specify if ordered sketch should be produced
53
+ * @return the result of the union
54
+ */
55
+ compact_theta_sketch_alloc<A> get_result(bool ordered = true) const;
56
+
57
+ private:
58
+ bool is_empty_;
59
+ uint64_t theta_;
60
+ update_theta_sketch_alloc<A> state_;
61
+
62
+ // for builder
63
+ theta_union_alloc(uint64_t theta, update_theta_sketch_alloc<A>&& state);
64
+ };
65
+
66
+ // builder
67
+
68
+ template<typename A>
69
+ class theta_union_alloc<A>::builder {
70
+ public:
71
+ typedef typename update_theta_sketch_alloc<A>::resize_factor resize_factor;
72
+
73
+ /**
74
+ * Set log2(k), where k is a nominal number of entries in the sketch
75
+ * @param lg_k base 2 logarithm of nominal number of entries
76
+ * @return this builder
77
+ */
78
+ builder& set_lg_k(uint8_t lg_k);
79
+
80
+ /**
81
+ * Set resize factor for the internal hash table (defaults to 8)
82
+ * @param rf resize factor
83
+ * @return this builder
84
+ */
85
+ builder& set_resize_factor(resize_factor rf);
86
+
87
+ /**
88
+ * Set sampling probability (initial theta). The default is 1, so the sketch retains
89
+ * all entries until it reaches the limit, at which point it goes into the estimation mode
90
+ * and reduces the effective sampling probability (theta) as necessary.
91
+ * @param p sampling probability
92
+ * @return this builder
93
+ */
94
+ builder& set_p(float p);
95
+
96
+ /**
97
+ * Set the seed for the hash function. Should be used carefully if needed.
98
+ * Sketches produced with different seed are not compatible
99
+ * and cannot be mixed in set operations.
100
+ * @param seed hash seed
101
+ * @return this builder
102
+ */
103
+ builder& set_seed(uint64_t seed);
104
+
105
+ /**
106
+ * This is to create an instance of the union with predefined parameters.
107
+ * @return and instance of the union
108
+ */
109
+ theta_union_alloc<A> build() const;
110
+
111
+ private:
112
+ typename update_theta_sketch_alloc<A>::builder sketch_builder;
113
+ };
114
+
115
+ // alias with default allocator for convenience
116
+ typedef theta_union_alloc<std::allocator<void>> theta_union;
117
+
118
+ } /* namespace datasketches */
119
+
120
+ #include "theta_union_impl.hpp"
121
+
122
+ # endif
@@ -0,0 +1,109 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef THETA_UNION_IMPL_HPP_
21
+ #define THETA_UNION_IMPL_HPP_
22
+
23
+ namespace datasketches {
24
+
25
+ /*
26
+ * author Alexander Saydakov
27
+ * author Lee Rhodes
28
+ * author Kevin Lang
29
+ */
30
+
31
+ template<typename A>
32
+ theta_union_alloc<A>::theta_union_alloc(uint64_t theta, update_theta_sketch_alloc<A>&& state):
33
+ is_empty_(true), theta_(theta), state_(std::move(state)) {}
34
+
35
+ template<typename A>
36
+ void theta_union_alloc<A>::update(const theta_sketch_alloc<A>& sketch) {
37
+ if (sketch.is_empty()) return;
38
+ if (sketch.get_seed_hash() != state_.get_seed_hash()) throw std::invalid_argument("seed hash mismatch");
39
+ is_empty_ = false;
40
+ if (sketch.get_theta64() < theta_) theta_ = sketch.get_theta64();
41
+ if (sketch.is_ordered()) {
42
+ for (auto hash: sketch) {
43
+ if (hash >= theta_) break; // early stop
44
+ state_.internal_update(hash);
45
+ }
46
+ } else {
47
+ for (auto hash: sketch) if (hash < theta_) state_.internal_update(hash);
48
+ }
49
+ if (state_.get_theta64() < theta_) theta_ = state_.get_theta64();
50
+ }
51
+
52
+ template<typename A>
53
+ compact_theta_sketch_alloc<A> theta_union_alloc<A>::get_result(bool ordered) const {
54
+ if (is_empty_) return state_.compact(ordered);
55
+ const uint32_t nom_num_keys = 1 << state_.lg_nom_size_;
56
+ if (theta_ >= state_.theta_ && state_.get_num_retained() <= nom_num_keys) return state_.compact(ordered);
57
+ uint64_t theta = std::min(theta_, state_.get_theta64());
58
+ vector_u64<A> keys(state_.get_num_retained());
59
+ uint32_t num_keys = 0;
60
+ for (auto key: state_) {
61
+ if (key < theta) keys[num_keys++] = key;
62
+ }
63
+ if (num_keys > nom_num_keys) {
64
+ std::nth_element(keys.begin(), keys.begin() + nom_num_keys, keys.begin() + num_keys);
65
+ theta = keys[nom_num_keys];
66
+ num_keys = nom_num_keys;
67
+ }
68
+ if (num_keys != state_.get_num_retained()) {
69
+ keys.resize(num_keys);
70
+ }
71
+ if (ordered) std::sort(keys.begin(), keys.end());
72
+ return compact_theta_sketch_alloc<A>(false, theta, std::move(keys), state_.get_seed_hash(), ordered);
73
+ }
74
+
75
+ // builder
76
+
77
+ template<typename A>
78
+ typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_lg_k(uint8_t lg_k) {
79
+ sketch_builder.set_lg_k(lg_k);
80
+ return *this;
81
+ }
82
+
83
+ template<typename A>
84
+ typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_resize_factor(resize_factor rf) {
85
+ sketch_builder.set_resize_factor(rf);
86
+ return *this;
87
+ }
88
+
89
+ template<typename A>
90
+ typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_p(float p) {
91
+ sketch_builder.set_p(p);
92
+ return *this;
93
+ }
94
+
95
+ template<typename A>
96
+ typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_seed(uint64_t seed) {
97
+ sketch_builder.set_seed(seed);
98
+ return *this;
99
+ }
100
+
101
+ template<typename A>
102
+ theta_union_alloc<A> theta_union_alloc<A>::builder::build() const {
103
+ update_theta_sketch_alloc<A> sketch = sketch_builder.build();
104
+ return theta_union_alloc(sketch.get_theta64(), std::move(sketch));
105
+ }
106
+
107
+ } /* namespace datasketches */
108
+
109
+ # endif
@@ -0,0 +1,45 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
# Test executable for the theta module; its sources are attached right below.
add_executable(theta_test)

target_sources(theta_test
  PRIVATE
    theta_sketch_test.cpp
    theta_union_test.cpp
    theta_intersection_test.cpp
    theta_a_not_b_test.cpp
)

target_link_libraries(theta_test theta common_test)

set_target_properties(theta_test PROPERTIES
  CXX_STANDARD 11
  CXX_STANDARD_REQUIRED YES
)

# This directory, as a CMake-style path with a trailing slash, is passed to the
# test code through the TEST_BINARY_INPUT_PATH compile definition.
file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" THETA_TEST_BINARY_PATH)
string(APPEND THETA_TEST_BINARY_PATH "/")
target_compile_definitions(theta_test
  PRIVATE
    TEST_BINARY_INPUT_PATH="${THETA_TEST_BINARY_PATH}"
)

add_test(NAME theta_test COMMAND theta_test)
@@ -0,0 +1,244 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch.hpp>
21
+
22
+ #include <theta_a_not_b.hpp>
23
+
24
+ namespace datasketches {
25
+
26
+ TEST_CASE("theta a-not-b: empty", "[theta_a_not_b]") {
27
+ theta_a_not_b a_not_b;
28
+ update_theta_sketch a = update_theta_sketch::builder().build();
29
+ update_theta_sketch b = update_theta_sketch::builder().build();
30
+ compact_theta_sketch result = a_not_b.compute(a, b);
31
+ REQUIRE(result.get_num_retained() == 0);
32
+ REQUIRE(result.is_empty());
33
+ REQUIRE_FALSE(result.is_estimation_mode());
34
+ REQUIRE(result.get_estimate() == 0.0);
35
+ }
36
+
37
+ TEST_CASE("theta a-not-b: non empty no retained keys", "[theta_a_not_b]") {
38
+ update_theta_sketch a = update_theta_sketch::builder().build();
39
+ a.update(1);
40
+ update_theta_sketch b = update_theta_sketch::builder().set_p(0.001).build();
41
+ theta_a_not_b a_not_b;
42
+
43
+ // B is still empty
44
+ compact_theta_sketch result = a_not_b.compute(a, b);
45
+ REQUIRE_FALSE(result.is_empty());
46
+ REQUIRE_FALSE(result.is_estimation_mode());
47
+ REQUIRE(result.get_num_retained() == 1);
48
+ REQUIRE(result.get_theta() == Approx(1).margin(1e-10));
49
+ REQUIRE(result.get_estimate() == 1.0);
50
+
51
+ // B is not empty in estimation mode and no entries
52
+ b.update(1);
53
+ REQUIRE(b.get_num_retained() == 0U);
54
+
55
+ result = a_not_b.compute(a, b);
56
+ REQUIRE_FALSE(result.is_empty());
57
+ REQUIRE(result.is_estimation_mode());
58
+ REQUIRE(result.get_num_retained() == 0);
59
+ REQUIRE(result.get_theta() == Approx(0.001).margin(1e-10));
60
+ REQUIRE(result.get_estimate() == 0.0);
61
+ }
62
+
63
+ TEST_CASE("theta a-not-b: exact mode half overlap", "[theta_a_not_b]") {
64
+ update_theta_sketch a = update_theta_sketch::builder().build();
65
+ int value = 0;
66
+ for (int i = 0; i < 1000; i++) a.update(value++);
67
+
68
+ update_theta_sketch b = update_theta_sketch::builder().build();
69
+ value = 500;
70
+ for (int i = 0; i < 1000; i++) b.update(value++);
71
+
72
+ theta_a_not_b a_not_b;
73
+
74
+ // unordered inputs, ordered result
75
+ compact_theta_sketch result = a_not_b.compute(a, b);
76
+ REQUIRE_FALSE(result.is_empty());
77
+ REQUIRE_FALSE(result.is_estimation_mode());
78
+ REQUIRE(result.is_ordered());
79
+ REQUIRE(result.get_estimate() == 500.0);
80
+
81
+ // unordered inputs, unordered result
82
+ result = a_not_b.compute(a, b, false);
83
+ REQUIRE_FALSE(result.is_empty());
84
+ REQUIRE_FALSE(result.is_estimation_mode());
85
+ REQUIRE_FALSE(result.is_ordered());
86
+ REQUIRE(result.get_estimate() == 500.0);
87
+
88
+ // ordered inputs
89
+ result = a_not_b.compute(a.compact(), b.compact());
90
+ REQUIRE_FALSE(result.is_empty());
91
+ REQUIRE_FALSE(result.is_estimation_mode());
92
+ REQUIRE(result.is_ordered());
93
+ REQUIRE(result.get_estimate() == 500.0);
94
+
95
+ // A is ordered, so the result is ordered regardless
96
+ result = a_not_b.compute(a.compact(), b, false);
97
+ REQUIRE_FALSE(result.is_empty());
98
+ REQUIRE_FALSE(result.is_estimation_mode());
99
+ REQUIRE(result.is_ordered());
100
+ REQUIRE(result.get_estimate() == 500.0);
101
+ }
102
+
103
+ TEST_CASE("theta a-not-b: exact mode disjoint", "[theta_a_not_b]") {
104
+ update_theta_sketch a = update_theta_sketch::builder().build();
105
+ int value = 0;
106
+ for (int i = 0; i < 1000; i++) a.update(value++);
107
+
108
+ update_theta_sketch b = update_theta_sketch::builder().build();
109
+ for (int i = 0; i < 1000; i++) b.update(value++);
110
+
111
+ theta_a_not_b a_not_b;
112
+
113
+ // unordered inputs
114
+ compact_theta_sketch result = a_not_b.compute(a, b);
115
+ REQUIRE_FALSE(result.is_empty());
116
+ REQUIRE_FALSE(result.is_estimation_mode());
117
+ REQUIRE(result.get_estimate() == 1000.0);
118
+
119
+ // ordered inputs
120
+ result = a_not_b.compute(a.compact(), b.compact());
121
+ REQUIRE_FALSE(result.is_empty());
122
+ REQUIRE_FALSE(result.is_estimation_mode());
123
+ REQUIRE(result.get_estimate() == 1000.0);
124
+ }
125
+
126
+ TEST_CASE("theta a-not-b: exact mode full overlap", "[theta_a_not_b]") {
127
+ update_theta_sketch sketch = update_theta_sketch::builder().build();
128
+ int value = 0;
129
+ for (int i = 0; i < 1000; i++) sketch.update(value++);
130
+
131
+ theta_a_not_b a_not_b;
132
+
133
+ // unordered inputs
134
+ compact_theta_sketch result = a_not_b.compute(sketch, sketch);
135
+ REQUIRE(result.is_empty());
136
+ REQUIRE_FALSE(result.is_estimation_mode());
137
+ REQUIRE(result.get_estimate() == 0.0);
138
+
139
+ // ordered inputs
140
+ result = a_not_b.compute(sketch.compact(), sketch.compact());
141
+ REQUIRE(result.is_empty());
142
+ REQUIRE_FALSE(result.is_estimation_mode());
143
+ REQUIRE(result.get_estimate() == 0.0);
144
+ }
145
+
146
+ TEST_CASE("theta a-not-b: estimation mode half overlap", "[theta_a_not_b]") {
147
+ update_theta_sketch a = update_theta_sketch::builder().build();
148
+ int value = 0;
149
+ for (int i = 0; i < 10000; i++) a.update(value++);
150
+
151
+ update_theta_sketch b = update_theta_sketch::builder().build();
152
+ value = 5000;
153
+ for (int i = 0; i < 10000; i++) b.update(value++);
154
+
155
+ theta_a_not_b a_not_b;
156
+
157
+ // unordered inputs
158
+ compact_theta_sketch result = a_not_b.compute(a, b);
159
+ REQUIRE_FALSE(result.is_empty());
160
+ REQUIRE(result.is_estimation_mode());
161
+ REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
162
+
163
+ // ordered inputs
164
+ result = a_not_b.compute(a.compact(), b.compact());
165
+ REQUIRE_FALSE(result.is_empty());
166
+ REQUIRE(result.is_estimation_mode());
167
+ REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
168
+ }
169
+
170
+ TEST_CASE("theta a-not-b: estimation mode disjoint", "[theta_a_not_b]") {
171
+ update_theta_sketch a = update_theta_sketch::builder().build();
172
+ int value = 0;
173
+ for (int i = 0; i < 10000; i++) a.update(value++);
174
+
175
+ update_theta_sketch b = update_theta_sketch::builder().build();
176
+ for (int i = 0; i < 10000; i++) b.update(value++);
177
+
178
+ theta_a_not_b a_not_b;
179
+
180
+ // unordered inputs
181
+ compact_theta_sketch result = a_not_b.compute(a, b);
182
+ REQUIRE_FALSE(result.is_empty());
183
+ REQUIRE(result.is_estimation_mode());
184
+ REQUIRE(result.get_estimate() == Approx(10000).margin(10000 * 0.02));
185
+
186
+ // ordered inputs
187
+ result = a_not_b.compute(a.compact(), b.compact());
188
+ REQUIRE_FALSE(result.is_empty());
189
+ REQUIRE(result.is_estimation_mode());
190
+ REQUIRE(result.get_estimate() == Approx(10000).margin(10000 * 0.02));
191
+ }
192
+
193
+ TEST_CASE("theta a-not-b: estimation mode full overlap", "[theta_a_not_b]") {
194
+ update_theta_sketch sketch = update_theta_sketch::builder().build();
195
+ int value = 0;
196
+ for (int i = 0; i < 10000; i++) sketch.update(value++);
197
+
198
+ theta_a_not_b a_not_b;
199
+
200
+ // unordered inputs
201
+ compact_theta_sketch result = a_not_b.compute(sketch, sketch);
202
+ REQUIRE_FALSE(result.is_empty());
203
+ REQUIRE(result.is_estimation_mode());
204
+ REQUIRE(result.get_estimate() == 0.0);
205
+
206
+ // ordered inputs
207
+ result = a_not_b.compute(sketch.compact(), sketch.compact());
208
+ REQUIRE_FALSE(result.is_empty());
209
+ REQUIRE(result.is_estimation_mode());
210
+ REQUIRE(result.get_estimate() == 0.0);
211
+ }
212
+
213
+ TEST_CASE("theta a-not-b: seed mismatch", "[theta_a_not_b]") {
214
+ update_theta_sketch sketch = update_theta_sketch::builder().build();
215
+ sketch.update(1); // non-empty should not be ignored
216
+ theta_a_not_b a_not_b(123);
217
+ REQUIRE_THROWS_AS(a_not_b.compute(sketch, sketch), std::invalid_argument);
218
+ }
219
+
220
+ TEST_CASE("theta a-not-b: issue #152", "[theta_a_not_b]") {
221
+ update_theta_sketch a = update_theta_sketch::builder().build();
222
+ int value = 0;
223
+ for (int i = 0; i < 10000; i++) a.update(value++);
224
+
225
+ update_theta_sketch b = update_theta_sketch::builder().build();
226
+ value = 5000;
227
+ for (int i = 0; i < 25000; i++) b.update(value++);
228
+
229
+ theta_a_not_b a_not_b;
230
+
231
+ // unordered inputs
232
+ compact_theta_sketch result = a_not_b.compute(a, b);
233
+ REQUIRE_FALSE(result.is_empty());
234
+ REQUIRE(result.is_estimation_mode());
235
+ REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.03));
236
+
237
+ // ordered inputs
238
+ result = a_not_b.compute(a.compact(), b.compact());
239
+ REQUIRE_FALSE(result.is_empty());
240
+ REQUIRE(result.is_estimation_mode());
241
+ REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.03));
242
+ }
243
+
244
+ } /* namespace datasketches */