datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,135 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef BOUNDS_ON_RATIOS_IN_THETA_SKETCHED_SETS_HPP_
21
+ #define BOUNDS_ON_RATIOS_IN_THETA_SKETCHED_SETS_HPP_
22
+
23
+ #include <cstdint>
24
+ #include <stdexcept>
25
+
26
+ #include <bounds_on_ratios_in_sampled_sets.hpp>
27
+
28
+ namespace datasketches {
29
+
30
+ /**
31
+ * This is to compute the bounds on the estimate of the ratio <i>B / A</i>, where:
32
+ * <ul>
33
+ * <li><i>A</i> is a Theta Sketch of population <i>PopA</i>.</li>
34
+ * <li><i>B</i> is a Theta Sketch of population <i>PopB</i> that is a subset of <i>A</i>,
35
+ * obtained by an intersection of <i>A</i> with some other Theta Sketch <i>C</i>,
36
+ * which acts like a predicate or selection clause.</li>
37
+ * <li>The estimate of the ratio <i>PopB/PopA</i> is
38
+ * estimate_of_b_over_a(<i>A, B</i>).</li>
39
+ * <li>The Upper Bound estimate on the ratio PopB/PopA is
40
+ * upper_bound_for_b_over_a(<i>A, B</i>).</li>
41
+ * <li>The Lower Bound estimate on the ratio PopB/PopA is
42
+ * lower_bound_for_b_over_a(<i>A, B</i>).</li>
43
+ * </ul>
44
+ * Note: The theta of <i>B</i> cannot be greater than the theta of <i>A</i>.
45
+ * If <i>B</i> is formed as an intersection of <i>A</i> and some other set <i>C</i>,
46
+ * then the theta of <i>B</i> is guaranteed to be less than or equal to the theta of <i>A</i>.
47
+ */
48
+ template<typename ExtractKey> // ExtractKey: functor that yields the hash key of a sketch entry
49
+ class bounds_on_ratios_in_theta_sketched_sets {
50
+ public:
51
+ /**
52
+ * Gets the approximate lower bound for B over A based on a 95% confidence interval
53
+ * @param sketch_a the sketch A
54
+ * @param sketch_b the sketch B; its theta must not exceed the theta of A (std::invalid_argument otherwise)
55
+ * @return the approximate lower bound for B over A, or 0 if A has no entries below the theta of B
56
+ */
57
+ template<typename SketchA, typename SketchB>
58
+ static double lower_bound_for_b_over_a(const SketchA& sketch_a, const SketchB& sketch_b) {
59
+ const uint64_t theta64_a = sketch_a.get_theta64();
60
+ const uint64_t theta64_b = sketch_b.get_theta64();
61
+ check_thetas(theta64_a, theta64_b);
62
+
63
+ const uint64_t count_b = sketch_b.get_num_retained();
64
+ const uint64_t count_a = (theta64_a == theta64_b)
65
+ ? sketch_a.get_num_retained()
66
+ : count_less_than_theta64(sketch_a, theta64_b); // only entries of A below the theta of B are counted
67
+
68
+ if (count_a == 0) return 0; // nothing to compare against in A: use the lowest possible ratio
69
+ const double f = sketch_b.get_theta(); // theta of B as a double, handed on as f
70
+ return bounds_on_ratios_in_sampled_sets::lower_bound_for_b_over_a(count_a, count_b, f);
71
+ }
72
+
73
+ /**
74
+ * Gets the approximate upper bound for B over A based on a 95% confidence interval
75
+ * @param sketch_a the sketch A
76
+ * @param sketch_b the sketch B; its theta must not exceed the theta of A (std::invalid_argument otherwise)
77
+ * @return the approximate upper bound for B over A, or 1 if A has no entries below the theta of B
78
+ */
79
+ template<typename SketchA, typename SketchB>
80
+ static double upper_bound_for_b_over_a(const SketchA& sketch_a, const SketchB& sketch_b) {
81
+ const uint64_t theta64_a = sketch_a.get_theta64();
82
+ const uint64_t theta64_b = sketch_b.get_theta64();
83
+ check_thetas(theta64_a, theta64_b);
84
+
85
+ const uint64_t count_b = sketch_b.get_num_retained();
86
+ const uint64_t count_a = (theta64_a == theta64_b)
87
+ ? sketch_a.get_num_retained()
88
+ : count_less_than_theta64(sketch_a, theta64_b); // only entries of A below the theta of B are counted
89
+
90
+ if (count_a == 0) return 1; // nothing to compare against in A: use the highest possible ratio
91
+ const double f = sketch_b.get_theta(); // theta of B as a double, handed on as f
92
+ return bounds_on_ratios_in_sampled_sets::upper_bound_for_b_over_a(count_a, count_b, f);
93
+ }
94
+
95
+ /**
96
+ * Gets the estimate for B over A
97
+ * @param sketch_a the sketch A
98
+ * @param sketch_b the sketch B; its theta must not exceed the theta of A (std::invalid_argument otherwise)
99
+ * @return the estimate for B over A, or 0.5 if A has no entries below the theta of B
100
+ */
101
+ template<typename SketchA, typename SketchB>
102
+ static double estimate_of_b_over_a(const SketchA& sketch_a, const SketchB& sketch_b) {
103
+ const uint64_t theta64_a = sketch_a.get_theta64();
104
+ const uint64_t theta64_b = sketch_b.get_theta64();
105
+ check_thetas(theta64_a, theta64_b);
106
+
107
+ const uint64_t count_b = sketch_b.get_num_retained();
108
+ const uint64_t count_a = (theta64_a == theta64_b)
109
+ ? sketch_a.get_num_retained()
110
+ : count_less_than_theta64(sketch_a, theta64_b); // only entries of A below the theta of B are counted
111
+
112
+ if (count_a == 0) return 0.5; // midpoint of the fallback lower (0) and upper (1) bounds above
113
+ return static_cast<double>(count_b) / static_cast<double>(count_a);
114
+ }
115
+
116
+ private:
117
+
118
+ static inline void check_thetas(uint64_t theta_a, uint64_t theta_b) { // rejects a B whose theta exceeds the theta of A
119
+ if (theta_b > theta_a) {
120
+ throw std::invalid_argument("theta_b must be <= theta_a"); // message now states the condition actually enforced
121
+ }
122
+ }
123
+
124
+ template<typename Sketch>
125
+ static uint64_t count_less_than_theta64(const Sketch& sketch, uint64_t theta) { // counts entries whose key is strictly below theta
126
+ uint64_t count = 0;
127
+ for (const auto& entry: sketch) if (ExtractKey()(entry) < theta) ++count;
128
+ return count;
129
+ }
130
+
131
+ };
132
+
133
+ } /* namespace datasketches */
134
+
135
+ # endif
@@ -0,0 +1,172 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef JACCARD_SIMILARITY_BASE_HPP_
21
+ #define JACCARD_SIMILARITY_BASE_HPP_
22
+
23
+ #include <memory>
24
+ #include <array>
25
+
26
+ #include <theta_union_experimental.hpp>
27
+ #include <theta_intersection_experimental.hpp>
28
+ #include <tuple_union.hpp>
29
+ #include <tuple_intersection.hpp>
30
+ #include <bounds_on_ratios_in_theta_sketched_sets.hpp>
31
+ #include <ceiling_power_of_2.hpp>
32
+ #include <common_defs.hpp>
33
+
34
+ namespace datasketches {
35
+
36
+ template<typename Union, typename Intersection, typename ExtractKey>
37
+ class jaccard_similarity_base { // static helpers only: Jaccard index of two sketches built from a Union and an Intersection
38
+ public:
39
+
40
+ /**
41
+ * Computes the Jaccard similarity index with upper and lower bounds. The Jaccard similarity index
42
+ * <i>J(A,B) = (A ^ B)/(A U B)</i> is used to measure how similar the two sketches are to each
43
+ * other. If J = 1.0, the sketches are considered equal. If J = 0, the two sketches are
44
+ * disjoint. A Jaccard of .95 means the overlap between the two
45
+ * sets is 95% of the union of the two sets.
46
+ *
47
+ * <p>Note: For very large pairs of sketches, where the configured nominal entries of the sketches
48
+ * are 2^25 or 2^26, this method may produce unpredictable results.
49
+ *
50
+ * @param sketch_a given sketch A
51
+ * @param sketch_b given sketch B
52
+ * @return a std::array<double, 3> {LowerBound, Estimate, UpperBound} of the Jaccard index.
53
+ * The Upper and Lower bounds are for a confidence interval of 95.4% or +/- 2 standard deviations.
54
+ */
55
+ template<typename SketchA, typename SketchB>
56
+ static std::array<double, 3> jaccard(const SketchA& sketch_a, const SketchB& sketch_b) {
57
+ if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return {1, 1, 1}; // same object
58
+ if (sketch_a.is_empty() && sketch_b.is_empty()) return {1, 1, 1}; // two empty sketches are treated as identical
59
+ if (sketch_a.is_empty() || sketch_b.is_empty()) return {0, 0, 0}; // exactly one is empty: treated as disjoint
60
+
61
+ auto union_ab = compute_union(sketch_a, sketch_b);
62
+ if (identical_sets(sketch_a, sketch_b, union_ab)) return {1, 1, 1}; // union matches both inputs in count and theta
63
+
64
+ // intersection
65
+ Intersection i;
66
+ i.update(sketch_a);
67
+ i.update(sketch_b);
68
+ i.update(union_ab); // ensures that intersection is a subset of the union
69
+ auto inter_abu = i.get_result(false); // NOTE(review): false presumably requests an unordered result - confirm against Intersection::get_result
70
+
71
+ return {
72
+ bounds_on_ratios_in_theta_sketched_sets<ExtractKey>::lower_bound_for_b_over_a(union_ab, inter_abu),
73
+ bounds_on_ratios_in_theta_sketched_sets<ExtractKey>::estimate_of_b_over_a(union_ab, inter_abu),
74
+ bounds_on_ratios_in_theta_sketched_sets<ExtractKey>::upper_bound_for_b_over_a(union_ab, inter_abu)
75
+ };
76
+ }
77
+
78
+ /**
79
+ * Returns true if the two given sketches are equivalent: the same object, both empty,
80
+ * or both matching their union in number of retained entries and in theta.
81
+ * @param sketch_a the given sketch A
82
+ * @param sketch_b the given sketch B
83
+ * @return true if the two given sketches are exactly equal
84
+ */
85
+ template<typename SketchA, typename SketchB>
86
+ static bool exactly_equal(const SketchA& sketch_a, const SketchB& sketch_b) {
87
+ if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return true; // same object
88
+ if (sketch_a.is_empty() && sketch_b.is_empty()) return true;
89
+ if (sketch_a.is_empty() || sketch_b.is_empty()) return false; // exactly one is empty
90
+
91
+ auto union_ab = compute_union(sketch_a, sketch_b);
92
+ if (identical_sets(sketch_a, sketch_b, union_ab)) return true;
93
+ return false;
94
+ }
95
+
96
+ /**
97
+ * Tests similarity of an actual Sketch against an expected Sketch.
98
+ * Computes the lower bound of the Jaccard index <i>J<sub>LB</sub></i> of the actual and
99
+ * expected sketches.
100
+ * if <i>J<sub>LB</sub> &ge; threshold</i>, then the sketches are considered to be
101
+ * similar with a confidence of 97.7%.
102
+ *
103
+ * @param actual the sketch to be tested
104
+ * @param expected the reference sketch that is considered to be correct
105
+ * @param threshold a real value between zero and one
106
+ * @return true if the lower bound of the Jaccard index is greater than or equal to the given threshold,
107
+ * i.e. the sketches are similar with at least 97.7% confidence
108
+ */
109
+ template<typename SketchA, typename SketchB>
110
+ static bool similarity_test(const SketchA& actual, const SketchB& expected, double threshold) {
111
+ auto jc = jaccard(actual, expected);
112
+ return jc[0] >= threshold; // jc[0] is the lower bound returned by jaccard()
113
+ }
114
+
115
+ /**
116
+ * Tests dissimilarity of an actual Sketch against an expected Sketch.
117
+ * Computes the upper bound of the Jaccard index <i>J<sub>UB</sub></i> of the actual and
118
+ * expected sketches.
119
+ * if <i>J<sub>UB</sub> &le; threshold</i>, then the sketches are considered to be
120
+ * dissimilar with a confidence of 97.7%.
121
+ *
122
+ * @param actual the sketch to be tested
123
+ * @param expected the reference sketch that is considered to be correct
124
+ * @param threshold a real value between zero and one
125
+ * @return true if the upper bound of the Jaccard index is less than or equal to the given threshold,
126
+ * i.e. the sketches are dissimilar with at least 97.7% confidence
127
+ */
128
+ template<typename SketchA, typename SketchB>
129
+ static bool dissimilarity_test(const SketchA& actual, const SketchB& expected, double threshold) {
130
+ auto jc = jaccard(actual, expected);
131
+ return jc[2] <= threshold; // jc[2] is the upper bound returned by jaccard()
132
+ }
133
+
134
+ private:
135
+
136
+ template<typename SketchA, typename SketchB>
137
+ static typename Union::CompactSketch compute_union(const SketchA& sketch_a, const SketchB& sketch_b) { // union of A and B sized to hold both
138
+ const unsigned count_a = sketch_a.get_num_retained();
139
+ const unsigned count_b = sketch_b.get_num_retained();
140
+ const unsigned lg_k = std::min(std::max(log2(ceiling_power_of_2(count_a + count_b)), theta_constants::MIN_LG_K), theta_constants::MAX_LG_K); // log2 of the rounded-up combined count, clamped to [MIN_LG_K, MAX_LG_K]
141
+ auto u = typename Union::builder().set_lg_k(lg_k).build();
142
+ u.update(sketch_a);
143
+ u.update(sketch_b);
144
+ return u.get_result(false); // NOTE(review): false presumably requests an unordered result - confirm against Union::get_result
145
+ }
146
+
147
+ template<typename SketchA, typename SketchB, typename UnionAB>
148
+ static bool identical_sets(const SketchA& sketch_a, const SketchB& sketch_b, const UnionAB& union_ab) { // true when the union has the same retained count and theta64 as both inputs
149
+ if (union_ab.get_num_retained() == sketch_a.get_num_retained() &&
150
+ union_ab.get_num_retained() == sketch_b.get_num_retained() &&
151
+ union_ab.get_theta64() == sketch_a.get_theta64() &&
152
+ union_ab.get_theta64() == sketch_b.get_theta64()) return true;
153
+ return false;
154
+ }
155
+
156
+ };
156
+
157
+ template<typename Allocator> // Jaccard similarity over experimental theta sketches with a caller-chosen allocator
158
+ using theta_jaccard_similarity_alloc = jaccard_similarity_base<theta_union_experimental<Allocator>, theta_intersection_experimental<Allocator>, trivial_extract_key>;
159
+
160
+ // alias with default allocator (std::allocator<uint64_t>) for convenience
161
+ using theta_jaccard_similarity = theta_jaccard_similarity_alloc<std::allocator<uint64_t>>;
162
+
163
+ template< // Jaccard similarity over tuple sketches; entry keys are extracted with pair_extract_key<uint64_t, Summary>
164
+ typename Summary,
165
+ typename IntersectionPolicy,
166
+ typename UnionPolicy = default_union_policy<Summary>,
167
+ typename Allocator = std::allocator<Summary>>
168
+ using tuple_jaccard_similarity = jaccard_similarity_base<tuple_union<Summary, UnionPolicy, Allocator>, tuple_intersection<Summary, IntersectionPolicy, Allocator>, pair_extract_key<uint64_t, Summary>>;
169
+
170
+ } /* namespace datasketches */
171
+
172
+ # endif
@@ -0,0 +1,53 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef THETA_A_NOT_B_EXPERIMENTAL_HPP_
21
+ #define THETA_A_NOT_B_EXPERIMENTAL_HPP_
22
+
23
+ #include "theta_sketch_experimental.hpp"
24
+ #include "theta_set_difference_base.hpp"
25
+
26
+ namespace datasketches {
27
+
28
+ template<typename Allocator = std::allocator<uint64_t>>
29
+ class theta_a_not_b_experimental {
30
+ public:
31
+ using Entry = uint64_t;
32
+ using ExtractKey = trivial_extract_key;
33
+ using CompactSketch = compact_theta_sketch_experimental<Allocator>;
34
+ using State = theta_set_difference_base<Entry, ExtractKey, CompactSketch, Allocator>;
35
+
36
+ explicit theta_a_not_b_experimental(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
37
+
38
+ /**
39
+ * Computes the a-not-b set operation given two sketches.
40
+ * @return the result of a-not-b
41
+ */
42
+ template<typename FwdSketch, typename Sketch>
43
+ CompactSketch compute(FwdSketch&& a, const Sketch& b, bool ordered = true) const;
44
+
45
+ private:
46
+ State state_;
47
+ };
48
+
49
+ } /* namespace datasketches */
50
+
51
+ #include "theta_a_not_b_experimental_impl.hpp"
52
+
53
+ #endif
@@ -0,0 +1,33 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ namespace datasketches {
21
+
22
+ template<typename A>
23
+ theta_a_not_b_experimental<A>::theta_a_not_b_experimental(uint64_t seed, const A& allocator):
24
+ state_(seed, allocator)
25
+ {}
26
+
27
+ template<typename A>
28
+ template<typename FwdSketch, typename Sketch>
29
+ auto theta_a_not_b_experimental<A>::compute(FwdSketch&& a, const Sketch& b, bool ordered) const -> CompactSketch {
30
+ return state_.compute(std::forward<FwdSketch>(a), b, ordered);
31
+ }
32
+
33
+ } /* namespace datasketches */
@@ -0,0 +1,48 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef THETA_COMPARATORS_HPP_
21
+ #define THETA_COMPARATORS_HPP_
22
+
23
+ namespace datasketches {
24
+
25
// Binary predicate that orders two entries by the keys ExtractKey pulls out of them.
// Returns true when the key of the first entry is less than the key of the second.
template<typename ExtractKey>
struct compare_by_key {
  template<typename Entry1, typename Entry2>
  bool operator()(Entry1&& a, Entry2&& b) const {
    // each key comes from a freshly constructed extractor, entries are perfectly forwarded
    auto&& key_a = ExtractKey()(std::forward<Entry1>(a));
    auto&& key_b = ExtractKey()(std::forward<Entry2>(b));
    return key_a < key_b;
  }
};
32
+
33
+ // less than
34
+
35
// Unary predicate: true when the key extracted from an entry is less than
// the key supplied at construction. The key is stored by value.
template<typename Key, typename Entry, typename ExtractKey>
class key_less_than {
public:
  explicit key_less_than(const Key& key): key_(key) {}

  bool operator()(const Entry& entry) const {
    return ExtractKey()(entry) < key_;
  }

private:
  Key key_; // upper bound (exclusive) that extracted keys are compared against
};
45
+
46
+ } /* namespace datasketches */
47
+
48
+ #endif
@@ -0,0 +1,34 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef THETA_CONSTANTS_HPP_
21
+ #define THETA_CONSTANTS_HPP_
22
+
23
+ namespace datasketches {
24
+
25
// Constants shared by the theta sketch code.
namespace theta_constants {
  // enumerator values are the implicit ones, written out explicitly
  enum resize_factor { X1 = 0, X2 = 1, X4 = 2, X8 = 3 };

  // signed max for compatibility with Java
  static constexpr uint64_t MAX_THETA = LLONG_MAX;

  // bounds applied to lg_k
  static constexpr uint8_t MIN_LG_K = 5;
  static constexpr uint8_t MAX_LG_K = 26;
}
31
+
32
+ } /* namespace datasketches */
33
+
34
+ #endif