datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,149 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch.hpp>
21
+
22
+ #include "cpc_union.hpp"
23
+
24
+ namespace datasketches {
25
+
26
+ static const double RELATIVE_ERROR_FOR_LG_K_11 = 0.02; // tolerance factor: the tests scale it by the expected estimate to form Approx margins
27
+
28
+ TEST_CASE("cpc union: lg k limits", "[cpc_union]") { // checks which lg_k values the cpc_union constructor accepts
29
+ cpc_union u1(CPC_MIN_LG_K); // this should work
30
+ cpc_union u2(CPC_MAX_LG_K); // this should work
31
+ REQUIRE_THROWS_AS(cpc_union(CPC_MIN_LG_K - 1), std::invalid_argument); // one below the minimum is expected to be rejected
32
+ REQUIRE_THROWS_AS(cpc_union(CPC_MAX_LG_K + 1), std::invalid_argument); // one above the maximum is expected to be rejected
33
+ }
34
+
35
+ TEST_CASE("cpc union: empty", "[cpc_union]") { // result of a union that was never updated
36
+ cpc_union u(11);
37
+ auto s = u.get_result();
38
+ REQUIRE(s.is_empty()); // the result sketch must report empty
39
+ REQUIRE(s.get_estimate() == 0.0); // exact comparison: the estimate is expected to be exactly zero
40
+ }
41
+
42
+ TEST_CASE("cpc union: copy", "[cpc_union]") { // copy construction and copy assignment of cpc_union
43
+ cpc_sketch s(11);
44
+ s.update(1);
45
+ cpc_union u1(11);
46
+ u1.update(s); // u1 now holds one distinct key
47
+
48
+ cpc_union u2 = u1; // copy constructor
49
+ auto s1 = u2.get_result();
50
+ REQUIRE_FALSE(s1.is_empty());
51
+ REQUIRE(s1.get_estimate() == Approx(1).margin(RELATIVE_ERROR_FOR_LG_K_11)); // the copy is expected to see the one key fed to u1
52
+ s.update(2); // s now holds keys 1 and 2
53
+ u2.update(s);
54
+ u1 = u2; // operator=
55
+ auto s2 = u1.get_result();
56
+ REQUIRE_FALSE(s2.is_empty());
57
+ REQUIRE(s2.get_estimate() == Approx(2).margin(2 * RELATIVE_ERROR_FOR_LG_K_11)); // after assignment u1 is expected to see both keys
58
+ }
59
+
60
+ TEST_CASE("cpc union: custom seed", "[cpc_union]") { // union and sketch built with a non-default second argument
61
+ cpc_sketch s(11, 123); // presumably 123 is the hash seed (per the test name) - confirm against cpc_sketch
62
+
63
+ s.update(1);
64
+ s.update(2);
65
+ s.update(3);
66
+
67
+ cpc_union u1(11, 123); // same second argument as the sketch
68
+ u1.update(s);
69
+ auto r = u1.get_result();
70
+ REQUIRE_FALSE(r.is_empty());
71
+ REQUIRE(r.get_estimate() == Approx(3).margin(3 * RELATIVE_ERROR_FOR_LG_K_11)); // three distinct keys were fed in
72
+
73
+ // incompatible seed
74
+ cpc_union u2(11, 234); // differs from the 123 used to build s
75
+ REQUIRE_THROWS_AS(u2.update(s), std::invalid_argument); // the mismatch is expected to be rejected on update
76
+ }
77
+
78
+ TEST_CASE("cpc union: large", "[cpc_union]") { // union of many sketches compared against one sketch fed the same keys
79
+ int key = 0; // running key: every update below uses a new distinct value
80
+ cpc_sketch s(11); // reference sketch that receives every key directly
81
+ cpc_union u(11);
82
+ for (int i = 0; i < 1000; i++) { // 1000 partial sketches
83
+ cpc_sketch tmp(11);
84
+ for (int j = 0; j < 10000; j++) { // 10000 keys per partial sketch; index named j so it does not shadow the outer i
85
+ s.update(key);
86
+ tmp.update(key);
87
+ key++;
88
+ }
89
+ u.update(tmp);
90
+ }
91
+ cpc_sketch r = u.get_result();
92
+ REQUIRE(r.get_num_coupons() == s.get_num_coupons()); // union result and reference must hold the same number of coupons
93
+ REQUIRE(r.get_estimate() == Approx(s.get_estimate()).margin(s.get_estimate() * RELATIVE_ERROR_FOR_LG_K_11)); // estimates must agree within the tolerance
94
+ }
95
+
96
+ TEST_CASE("cpc union: reduce k empty", "[cpc_union]") { // a still-empty lg_k 12 union receives a lg_k 11 sketch
97
+ cpc_sketch s(11);
98
+ for (int i = 0; i < 10000; i++) s.update(i); // 10000 distinct keys
99
+ cpc_union u(12); // created with a larger lg_k than the sketch
100
+ u.update(s);
101
+ cpc_sketch r = u.get_result();
102
+ REQUIRE(r.get_lg_k() == 11); // the result is expected to come back at the smaller lg_k
103
+ REQUIRE(r.get_estimate() == Approx(10000).margin(10000 * RELATIVE_ERROR_FOR_LG_K_11));
104
+ }
105
+
106
+ TEST_CASE("cpc union: reduce k sparse", "[cpc_union]") { // a lg_k 12 union already holding 100 keys receives a lg_k 11 sketch
107
+ cpc_union u(12);
108
+
109
+ cpc_sketch s12(12);
110
+ for (int i = 0; i < 100; i++) s12.update(i); // keys 0..99
111
+ u.update(s12);
112
+
113
+ cpc_sketch s11(11);
114
+ for (int i = 0; i < 1000; i++) s11.update(i); // keys 0..999, a superset of the keys in s12
115
+ u.update(s11);
116
+
117
+ cpc_sketch r = u.get_result();
118
+ REQUIRE(r.get_lg_k() == 11); // the result is expected to come back at the smaller lg_k
119
+ REQUIRE(r.get_estimate() == Approx(1000).margin(1000 * RELATIVE_ERROR_FOR_LG_K_11)); // 1000 distinct keys overall
120
+ }
121
+
122
+ TEST_CASE("cpc union: reduce k window", "[cpc_union]") { // a lg_k 12 union already holding 500 keys receives a lg_k 11 sketch
123
+ cpc_union u(12);
124
+
125
+ cpc_sketch s12(12);
126
+ for (int i = 0; i < 500; i++) s12.update(i); // keys 0..499
127
+ u.update(s12);
128
+
129
+ cpc_sketch s11(11);
130
+ for (int i = 0; i < 1000; i++) s11.update(i); // keys 0..999, a superset of the keys in s12
131
+ u.update(s11);
132
+
133
+ cpc_sketch r = u.get_result();
134
+ REQUIRE(r.get_lg_k() == 11); // the result is expected to come back at the smaller lg_k
135
+ REQUIRE(r.get_estimate() == Approx(1000).margin(1000 * RELATIVE_ERROR_FOR_LG_K_11)); // 1000 distinct keys overall
136
+ }
137
+
138
+ TEST_CASE("cpc union: moving update", "[cpc_union]") { // update() called with an rvalue sketch
139
+ cpc_union u(11);
140
+ cpc_sketch s(11);
141
+ for (int i = 0; i < 100; i++) s.update(i); // sparse
142
+ u.update(std::move(s)); // s is passed as an rvalue and is not used again in this test
143
+ cpc_sketch r = u.get_result();
144
+ REQUIRE(r.get_estimate() == Approx(100).margin(100 * RELATIVE_ERROR_FOR_LG_K_11)); // 100 distinct keys were fed in
145
+ }
146
+
147
+
148
+
149
+ } /* namespace datasketches */
@@ -0,0 +1,54 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ add_library(fi INTERFACE) # INTERFACE target: carries usage requirements only, nothing is compiled for fi itself
19
+
20
+ add_library(${PROJECT_NAME}::FI ALIAS fi) # namespaced alias for the same target
21
+
22
+ if (BUILD_TESTS) # the test subdirectory is added only when BUILD_TESTS is set
23
+ add_subdirectory(test)
24
+ endif()
25
+
26
+ target_include_directories(fi
27
+ INTERFACE
28
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
29
+ $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
30
+ ) # NOTE(review): the headers are installed into .../DataSketches further down, while this install-time path is .../include - confirm the intended include prefix
31
+
32
+ target_link_libraries(fi INTERFACE common) # consumers of fi also link the common target
33
+ target_compile_features(fi INTERFACE cxx_std_11) # consumers must compile as C++11 or newer
34
+
35
+ set(fi_HEADERS "") # header list consumed by install(FILES ...)
36
+ list(APPEND fi_HEADERS "include/frequent_items_sketch.hpp")
37
+ list(APPEND fi_HEADERS "include/frequent_items_sketch_impl.hpp")
38
+ list(APPEND fi_HEADERS "include/reverse_purge_hash_map.hpp")
39
+ list(APPEND fi_HEADERS "include/reverse_purge_hash_map_impl.hpp")
40
+
41
+ install(TARGETS fi
42
+ EXPORT ${PROJECT_NAME}
43
+ )
44
+
45
+ install(FILES ${fi_HEADERS}
46
+ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
47
+
48
+ target_sources(fi
49
+ INTERFACE
50
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/frequent_items_sketch.hpp
51
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/frequent_items_sketch_impl.hpp
52
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/reverse_purge_hash_map.hpp
53
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/reverse_purge_hash_map_impl.hpp
54
+ ) # NOTE(review): repeats the fi_HEADERS entries with source-tree paths - keep the two lists in sync
@@ -0,0 +1,319 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef FREQUENT_ITEMS_SKETCH_HPP_
21
+ #define FREQUENT_ITEMS_SKETCH_HPP_
22
+
23
+ #include <memory>
24
+ #include <vector>
25
+ #include <iostream>
26
+ #include <functional>
27
+ #include <type_traits>
28
+
29
+ #include "reverse_purge_hash_map.hpp"
30
+ #include "common_defs.hpp"
31
+ #include "serde.hpp"
32
+
33
+ namespace datasketches {
34
+
35
+ /*
36
+ * Based on Java implementation here:
37
+ * https://github.com/DataSketches/sketches-core/blob/master/src/main/java/com/yahoo/sketches/frequencies/ItemsSketch.java
38
+ * author Alexander Saydakov
39
+ */
40
+
41
+ enum frequent_items_error_type { NO_FALSE_POSITIVES, NO_FALSE_NEGATIVES }; // selects whether frequent-item queries filter on the lower bound (NO_FALSE_POSITIVES) or the upper bound (NO_FALSE_NEGATIVES)
42
+
43
+ // for serialization as raw bytes
44
+ template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>; // allocator A rebound to uint8_t
45
+ template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>; // byte vector that allocates through AllocU8<A>
46
+
47
+ // type W for weight must be an arithmetic type (integral or floating point)
48
+ template<typename T, typename W = uint64_t, typename H = std::hash<T>, typename E = std::equal_to<T>, typename S = serde<T>, typename A = std::allocator<T>>
49
+ class frequent_items_sketch {
50
+ public:
51
+
52
+ /**
53
+ * Construct this sketch with parameters lg_max_map_size and lg_start_map_size.
54
+ *
55
+ * @param lg_max_map_size Log2 of the physical size of the internal hash map managed by this
56
+ * sketch. The maximum capacity of this internal hash map is 0.75 times 2^lg_max_map_size.
57
+ * Both the ultimate accuracy and size of this sketch are functions of lg_max_map_size.
58
+ *
59
+ * @param lg_start_map_size Log2 of the starting physical size of the internal hash
60
+ * map managed by this sketch.
61
+ */
62
+ explicit frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size = LG_MIN_MAP_SIZE);
63
+
64
+ /**
65
+ * Update this sketch with an item and a positive weight (frequency count).
66
+ * @param item for which the weight should be increased (lvalue)
67
+ * @param weight the amount by which the weight of the item should be increased
68
+ * A count of zero is a no-op, and a negative count will throw an exception.
69
+ */
70
+ void update(const T& item, W weight = 1);
71
+
72
+ /**
73
+ * Update this sketch with an item and a positive weight (frequency count).
74
+ * @param item for which the weight should be increased (rvalue)
75
+ * @param weight the amount by which the weight of the item should be increased
76
+ * A count of zero is a no-op, and a negative count will throw an exception.
77
+ */
78
+ void update(T&& item, W weight = 1);
79
+
80
+ /**
81
+ * This function merges the other sketch into this one.
82
+ * The other sketch may be of a different size.
83
+ * @param other sketch to be merged into this (lvalue)
84
+ */
85
+ void merge(const frequent_items_sketch& other);
86
+
87
+ /**
88
+ * This function merges the other sketch into this one.
89
+ * The other sketch may be of a different size.
90
+ * @param other sketch to be merged into this (rvalue)
91
+ */
92
+ void merge(frequent_items_sketch&& other);
93
+
94
+ /**
95
+ * @return true if this sketch is empty
96
+ */
97
+ bool is_empty() const;
98
+
99
+ /**
100
+ * @return the number of active items in the sketch
101
+ */
102
+ uint32_t get_num_active_items() const;
103
+
104
+ /**
105
+ * Returns the sum of the weights (frequencies) in the stream seen so far by the sketch
106
+ *
107
+ * @return the total weight of all items in the stream seen so far by the sketch
108
+ */
109
+ W get_total_weight() const;
110
+
111
+ /**
112
+ * Returns the estimate of the weight (frequency) of the given item.
113
+ * Note: The true frequency of an item would be the sum of the counts as a result of the
114
+ * two update functions.
115
+ *
116
+ * @param item the given item
117
+ * @return the estimate of the weight (frequency) of the given item
118
+ */
119
+ W get_estimate(const T& item) const;
120
+
121
+ /**
122
+ * Returns the guaranteed lower bound weight (frequency) of the given item.
123
+ *
124
+ * @param item the given item.
125
+ * @return the guaranteed lower bound weight of the given item. That is, a number which
126
+ * is guaranteed to be no larger than the real weight.
127
+ */
128
+ W get_lower_bound(const T& item) const;
129
+
130
+ /**
131
+ * Returns the guaranteed upper bound weight (frequency) of the given item.
132
+ *
133
+ * @param item the given item
134
+ * @return the guaranteed upper bound weight of the given item. That is, a number which
135
+ * is guaranteed to be no smaller than the real frequency.
136
+ */
137
+ W get_upper_bound(const T& item) const;
138
+
139
+ /**
140
+ * @return An upper bound on the maximum error of get_estimate(item) for any item.
141
+ * This is equivalent to the maximum distance between the upper bound and the lower bound
142
+ * for any item.
143
+ */
144
+ W get_maximum_error() const;
145
+
146
+ /**
147
+ * Returns epsilon value of this sketch.
148
+ * This is just the value <i>3.5 / max_map_size</i>.
149
+ * @return epsilon used by the sketch to compute error.
150
+ */
151
+ double get_epsilon() const;
152
+
153
+ /**
154
+ * Returns epsilon used to compute <i>a priori</i> error.
155
+ * This is just the value <i>3.5 / max_map_size</i>.
155
+ * @param lg_max_map_size log2 of the planned map size to be used when constructing this sketch.
157
+ * @return epsilon used to compute <i>a priori</i> error.
158
+ */
159
+ static double get_epsilon(uint8_t lg_max_map_size);
160
+
161
+ /**
162
+ * Returns the estimated <i>a priori</i> error given the max_map_size for the sketch and the
163
+ * estimated_total_weight.
164
+ * @param lg_max_map_size the planned map size to be used when constructing this sketch.
165
+ * @param estimated_total_weight the estimated total stream weight.
166
+ * @return the estimated <i>a priori</i> error.
167
+ */
168
+ static double get_apriori_error(uint8_t lg_max_map_size, W estimated_total_weight);
169
+
170
+ class row;
171
+ typedef typename std::vector<row, typename std::allocator_traits<A>::template rebind_alloc<row>> vector_row; // alias for users
172
+
173
+ /**
174
+ * Returns an array of rows that include frequent items, estimates, upper and lower bounds
175
+ * given an error_type and using get_maximum_error() as a threshold.
176
+ *
177
+ * <p>The method first examines all active items in the sketch (items that have a counter).</p>
178
+ *
179
+ * <p>If <i>error_type = NO_FALSE_NEGATIVES</i>, this will include an item in the result
180
+ * list if get_upper_bound(item) &gt; threshold.
181
+ * There will be no false negatives, i.e., no Type II error.
182
+ * There may be items in the set with true frequencies less than the threshold
183
+ * (false positives).</p>
184
+ *
185
+ * <p>If <i>error_type = NO_FALSE_POSITIVES</i>, this will include an item in the result
186
+ * list if get_lower_bound(item) &gt; threshold.
187
+ * There will be no false positives, i.e., no Type I error.
188
+ * There may be items omitted from the set with true frequencies greater than the
189
+ * threshold (false negatives).</p>
190
+ *
191
+ * @param err_type determines whether no false positives or no false negatives are desired.
192
+ * @return an array of frequent items
193
+ */
194
+ vector_row get_frequent_items(frequent_items_error_type err_type) const;
195
+
196
+ /**
197
+ * Returns an array of rows that include frequent items, estimates, upper and lower bounds
198
+ * given an error_type and a threshold.
199
+ *
200
+ * <p>The method first examines all active items in the sketch (items that have a counter).</p>
201
+ *
202
+ * <p>If <i>error_type = NO_FALSE_NEGATIVES</i>, this will include an item in the result
203
+ * list if get_upper_bound(item) &gt; threshold.
204
+ * There will be no false negatives, i.e., no Type II error.
205
+ * There may be items in the set with true frequencies less than the threshold
206
+ * (false positives).</p>
207
+ *
208
+ * <p>If <i>error_type = NO_FALSE_POSITIVES</i>, this will include an item in the result
209
+ * list if get_lower_bound(item) &gt; threshold.
210
+ * There will be no false positives, i.e., no Type I error.
211
+ * There may be items omitted from the set with true frequencies greater than the
212
+ * threshold (false negatives).</p>
213
+ *
214
+ * @param err_type determines whether no false positives or no false negatives are desired.
215
+ * @param threshold to include items in the result list
216
+ * @return an array of frequent items
217
+ */
218
+ vector_row get_frequent_items(frequent_items_error_type err_type, W threshold) const;
219
+
220
+ /**
221
+ * Computes size needed to serialize the current state of the sketch.
222
+ * This can be expensive since every item needs to be looked at.
223
+ * @return size in bytes needed to serialize this sketch
224
+ */
225
+ size_t get_serialized_size_bytes() const;
226
+
227
+ /**
228
+ * This method serializes the sketch into a given stream in a binary form
229
+ * @param os output stream
230
+ */
231
+ void serialize(std::ostream& os) const;
232
+
233
+ // This is a convenience alias for users
234
+ // The type returned by the following serialize method
235
+ typedef vector_u8<A> vector_bytes;
236
+
237
+ /**
238
+ * This method serializes the sketch as a vector of bytes.
239
+ * An optional header can be reserved in front of the sketch.
240
+ * It is a blank space of a given size.
241
+ * This header is used in Datasketches PostgreSQL extension.
242
+ * @param header_size_bytes space to reserve in front of the sketch
243
+ * @return serialized sketch as a vector of bytes
244
+ */
245
+ vector_bytes serialize(unsigned header_size_bytes = 0) const;
246
+
247
+ /**
248
+ * This method deserializes a sketch from a given stream.
249
+ * @param is input stream
250
+ * @return an instance of the sketch
251
+ */
252
+ static frequent_items_sketch deserialize(std::istream& is);
253
+
254
+ /**
255
+ * This method deserializes a sketch from a given array of bytes.
256
+ * @param bytes pointer to the array of bytes
257
+ * @param size the size of the array
258
+ * @return an instance of the sketch
259
+ */
260
+ static frequent_items_sketch deserialize(const void* bytes, size_t size);
261
+
262
+ /**
263
+ * Returns a human readable summary of this sketch
264
+ * @param print_items if true include the list of items retained by the sketch
265
+ */
266
+ string<A> to_string(bool print_items = false) const;
267
+
268
+ private:
269
+ static const uint8_t LG_MIN_MAP_SIZE = 3;
270
+ static const uint8_t SERIAL_VERSION = 1;
271
+ static const uint8_t FAMILY_ID = 10;
272
+ static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
273
+ static const uint8_t PREAMBLE_LONGS_NONEMPTY = 4;
274
+ static constexpr double EPSILON_FACTOR = 3.5;
275
+ enum flags { IS_EMPTY };
276
+ W total_weight;
277
+ W offset;
278
+ reverse_purge_hash_map<T, W, H, E, A> map;
279
+ static void check_preamble_longs(uint8_t preamble_longs, bool is_empty);
280
+ static void check_serial_version(uint8_t serial_version);
281
+ static void check_family_id(uint8_t family_id);
282
+ static void check_size(uint8_t lg_cur_size, uint8_t lg_max_size);
283
+
284
+ // version for integral signed type
285
+ template<typename WW = W, typename std::enable_if<std::is_integral<WW>::value && std::is_signed<WW>::value, int>::type = 0>
286
+ static inline void check_weight(WW weight);
287
+
288
+ // version for integral unsigned type
289
+ template<typename WW = W, typename std::enable_if<std::is_integral<WW>::value && std::is_unsigned<WW>::value, int>::type = 0>
290
+ static inline void check_weight(WW weight);
291
+
292
+ // version for floating point type
293
+ template<typename WW = W, typename std::enable_if<std::is_floating_point<WW>::value, int>::type = 0>
294
+ static inline void check_weight(WW weight);
295
+
296
+ // for deserialize
297
+ class items_deleter;
298
+ };
299
+
300
+ template<typename T, typename W, typename H, typename E, typename S, typename A>
301
+ class frequent_items_sketch<T, W, H, E, S, A>::row { // one entry of a get_frequent_items() result: an item together with its weight bounds
302
+ public:
303
+ row(const T* item, W weight, W offset): // stores the pointer and both weights exactly as given; nothing is copied or validated here
304
+ item(item), weight(weight), offset(offset) {}
305
+ const T& get_item() const { return *item; } // dereferences the stored pointer without a null check
306
+ W get_estimate() const { return weight + offset; } // estimate is weight plus offset, i.e. the same value as get_upper_bound()
307
+ W get_lower_bound() const { return weight; } // lower bound is the stored weight alone
308
+ W get_upper_bound() const { return weight + offset; } // upper bound is weight plus offset
309
+ private:
310
+ const T* item; // raw pointer; row declares no destructor and never frees it (presumably points into the sketch's storage -- TODO confirm lifetime)
311
+ W weight; // returned unchanged by get_lower_bound()
312
+ W offset; // added to weight for the estimate and the upper bound
313
+ };
314
+
315
+ }
316
+
317
+ #include "frequent_items_sketch_impl.hpp"
318
+
319
+ # endif