datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,144 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <iostream>
21
+
22
+ #include <catch.hpp>
23
+ #include <jaccard_similarity.hpp>
24
+
25
+ namespace datasketches {
26
+
27
+ using update_theta_sketch = update_theta_sketch_experimental<>; // short alias: the experimental update sketch with its default template arguments
28
+
29
+ TEST_CASE("theta jaccard: empty", "[theta_sketch]") { // compares two sketches that never receive an update
30
+ auto sk_a = update_theta_sketch::builder().build();
31
+ auto sk_b = update_theta_sketch::builder().build();
32
+ auto jc = theta_jaccard_similarity::jaccard(sk_a, sk_b);
33
+ REQUIRE(jc == std::array<double, 3>{1, 1, 1}); // all three values must be exactly 1; presumably {lower bound, estimate, upper bound} - confirm in jaccard_similarity.hpp
34
+ }
35
+
36
+ TEST_CASE("theta jaccard: same sketch exact mode", "[theta_sketch]") { // a sketch compared against itself must report 1 in every slot
36
+ auto sk = update_theta_sketch::builder().build();
37
+ for (int i = 0; i < 1000; ++i) sk.update(i); // feed the integers 0..999
39
+
40
+ // update sketch passed as both arguments
41
+ auto jc = theta_jaccard_similarity::jaccard(sk, sk);
42
+ REQUIRE(jc == std::array<double, 3>{1, 1, 1});
43
+
44
+ // two compact copies produced from the same update sketch
45
+ jc = theta_jaccard_similarity::jaccard(sk.compact(), sk.compact());
46
+ REQUIRE(jc == std::array<double, 3>{1, 1, 1});
47
+ }
48
+
49
+ TEST_CASE("theta jaccard: full overlap exact mode", "[theta_sketch]") { // two independent sketches fed identical input must report 1 in every slot
50
+ auto sk_a = update_theta_sketch::builder().build();
51
+ auto sk_b = update_theta_sketch::builder().build();
52
+ for (int i = 0; i < 1000; ++i) { // both sketches receive the same integers 0..999
53
+ sk_a.update(i);
54
+ sk_b.update(i);
55
+ }
56
+
57
+ // update sketches compared directly
58
+ auto jc = theta_jaccard_similarity::jaccard(sk_a, sk_b);
59
+ REQUIRE(jc == std::array<double, 3>{1, 1, 1});
60
+
61
+ // same comparison on the compact forms
62
+ jc = theta_jaccard_similarity::jaccard(sk_a.compact(), sk_b.compact());
63
+ REQUIRE(jc == std::array<double, 3>{1, 1, 1});
64
+ }
65
+
66
+ TEST_CASE("theta jaccard: disjoint exact mode", "[theta_sketch]") { // sketches with no common input must report 0 in every slot
67
+ auto sk_a = update_theta_sketch::builder().build();
68
+ auto sk_b = update_theta_sketch::builder().build();
69
+ for (int i = 0; i < 1000; ++i) {
70
+ sk_a.update(i); // sk_a sees 0..999
71
+ sk_b.update(i + 1000); // sk_b sees 1000..1999, so the two inputs never overlap
72
+ }
73
+
74
+ // update sketches compared directly
75
+ auto jc = theta_jaccard_similarity::jaccard(sk_a, sk_b);
76
+ REQUIRE(jc == std::array<double, 3>{0, 0, 0});
77
+
78
+ // same comparison on the compact forms
79
+ jc = theta_jaccard_similarity::jaccard(sk_a.compact(), sk_b.compact());
80
+ REQUIRE(jc == std::array<double, 3>{0, 0, 0});
81
+ }
82
+
83
+ TEST_CASE("theta jaccard: half overlap estimation mode", "[theta_sketch]") { // partially overlapping inputs: all three values must land near 1/3
84
+ auto sk_a = update_theta_sketch::builder().build();
85
+ auto sk_b = update_theta_sketch::builder().build();
86
+ for (int i = 0; i < 10000; ++i) {
87
+ sk_a.update(i); // sk_a sees 0..9999
88
+ sk_b.update(i + 5000); // sk_b sees 5000..14999: 5000 shared values out of 15000 distinct, i.e. a true ratio of 1/3
89
+ }
90
+
91
+ // update sketches: each value must be within 0.01 of 0.33
92
+ auto jc = theta_jaccard_similarity::jaccard(sk_a, sk_b);
93
+ REQUIRE(jc[0] == Approx(0.33).margin(0.01));
94
+ REQUIRE(jc[1] == Approx(0.33).margin(0.01));
95
+ REQUIRE(jc[2] == Approx(0.33).margin(0.01));
96
+
97
+ // compact sketches: same tolerance as above
98
+ jc = theta_jaccard_similarity::jaccard(sk_a.compact(), sk_b.compact());
99
+ REQUIRE(jc[0] == Approx(0.33).margin(0.01));
100
+ REQUIRE(jc[1] == Approx(0.33).margin(0.01));
101
+ REQUIRE(jc[2] == Approx(0.33).margin(0.01));
102
+ }
103
+
104
+ /**
105
+ * The distribution is quite tight, about +/- 0.7%, which is pretty good since the accuracy of the
106
+ * underlying sketch is about +/- 1.56%.
107
+ */
108
+ TEST_CASE("theta jaccard: similarity test", "[theta_sketch]") { // similarity_test must accept a sketch covering ~95% of the reference input
109
+ const int8_t min_lg_k = 12; // lg_k handed to both builders below
110
+ const int u1 = 1 << 20; // 1048576 values go into the reference sketch
111
+ const int u2 = u1 * 0.95; // double product truncated to int: 996147 values, a subset of the reference input
112
+ const double threshold = 0.943; // just below the true ratio u2 / u1 of about 0.95
113
+
114
+ auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).build();
115
+ for (int i = 0; i < u1; ++i) expected.update(i);
116
+
117
+ auto actual = update_theta_sketch::builder().set_lg_k(min_lg_k).build();
118
+ for (int i = 0; i < u2; ++i) actual.update(i);
119
+
120
+ REQUIRE(theta_jaccard_similarity::similarity_test(actual, expected, threshold));
121
+ REQUIRE(theta_jaccard_similarity::similarity_test(actual, actual, threshold)); // a sketch must also pass against itself
122
+ }
123
+
124
+ /**
125
+ * The distribution is much looser here, about +/- 14%. This is due to the fact that intersections lose accuracy
126
+ * as the ratio of intersection to the union becomes a small number.
127
+ */
128
+ TEST_CASE("theta jaccard: dissimilarity test", "[theta_sketch]") { // dissimilarity_test must accept a sketch covering only ~5% of the reference input
129
+ const int8_t min_lg_k = 12; // lg_k handed to both builders below
130
+ const int u1 = 1 << 20; // 1048576 values go into the reference sketch
131
+ const int u2 = u1 * 0.05; // double product truncated to int: 52428 values, a subset of the reference input
132
+ const double threshold = 0.061; // just above the true ratio u2 / u1 of about 0.05
133
+
134
+ auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).build();
135
+ for (int i = 0; i < u1; ++i) expected.update(i);
136
+
137
+ auto actual = update_theta_sketch::builder().set_lg_k(min_lg_k).build();
138
+ for (int i = 0; i < u2; ++i) actual.update(i);
139
+
140
+ REQUIRE(theta_jaccard_similarity::dissimilarity_test(actual, expected, threshold));
141
+ REQUIRE_FALSE(theta_jaccard_similarity::dissimilarity_test(actual, actual, threshold)); // a sketch must never be reported as dissimilar to itself
142
+ }
143
+
144
+ } /* namespace datasketches */
@@ -0,0 +1,247 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <fstream>
21
+ #include <sstream>
22
+
23
+ #include <catch.hpp>
24
+ #include <theta_sketch_experimental.hpp>
25
+
26
+ namespace datasketches {
27
+
28
+ #ifdef TEST_BINARY_INPUT_PATH // directory prefix for the binary .sk fixtures opened by the deserialize tests
29
+ const std::string inputPath = TEST_BINARY_INPUT_PATH; // taken from the macro when it is defined
30
+ #else
31
+ const std::string inputPath = "test/"; // fallback: a relative "test/" directory
32
+ #endif
33
+
34
+ // These tests have been copied from the existing theta sketch implementation.
35
+ // Serialization as base class and serialization of update sketch have been removed.
36
+
37
+ using update_theta_sketch = update_theta_sketch_experimental<>; // short alias: experimental update sketch with default template arguments
38
+ using compact_theta_sketch = compact_theta_sketch_experimental<>; // short alias: experimental compact sketch with default template arguments
39
+
40
+ TEST_CASE("theta sketch: empty", "[theta_sketch]") { // a freshly built sketch and its compact form must both report the empty state
41
+ update_theta_sketch update_sketch = update_theta_sketch::builder().build(); // no update() call follows
42
+ REQUIRE(update_sketch.is_empty());
43
+ REQUIRE_FALSE(update_sketch.is_estimation_mode());
44
+ REQUIRE(update_sketch.get_theta() == 1.0);
45
+ REQUIRE(update_sketch.get_estimate() == 0.0);
46
+ REQUIRE(update_sketch.get_lower_bound(1) == 0.0);
47
+ REQUIRE(update_sketch.get_upper_bound(1) == 0.0);
48
+
49
+ compact_theta_sketch compact_sketch = update_sketch.compact(); // the compact form must pass the same checks
50
+ REQUIRE(compact_sketch.is_empty());
51
+ REQUIRE_FALSE(compact_sketch.is_estimation_mode());
52
+ REQUIRE(compact_sketch.get_theta() == 1.0);
53
+ REQUIRE(compact_sketch.get_estimate() == 0.0);
54
+ REQUIRE(compact_sketch.get_lower_bound(1) == 0.0);
55
+ REQUIRE(compact_sketch.get_upper_bound(1) == 0.0);
56
+ }
57
+
58
+ TEST_CASE("theta sketch: non empty no retained keys", "[theta_sketch]") { // one update that leaves nothing retained: non-empty, yet zero entries
59
+ update_theta_sketch update_sketch = update_theta_sketch::builder().set_p(0.001).build(); // p presumably is the up-front sampling probability - confirm in the builder
60
+ update_sketch.update(1);
61
+ //std::cerr << update_sketch.to_string();
62
+ REQUIRE(update_sketch.get_num_retained() == 0); // the single update was not retained
63
+ REQUIRE_FALSE(update_sketch.is_empty()); // but the sketch still counts as non-empty
64
+ REQUIRE(update_sketch.is_estimation_mode());
65
+ REQUIRE(update_sketch.get_estimate() == 0.0);
66
+ REQUIRE(update_sketch.get_lower_bound(1) == 0.0);
67
+ REQUIRE(update_sketch.get_upper_bound(1) > 0); // upper bound must be positive even though the estimate is 0
68
+
69
+ compact_theta_sketch compact_sketch = update_sketch.compact(); // the compact form must pass the same checks
70
+ REQUIRE(compact_sketch.get_num_retained() == 0);
71
+ REQUIRE_FALSE(compact_sketch.is_empty());
72
+ REQUIRE(compact_sketch.is_estimation_mode());
73
+ REQUIRE(compact_sketch.get_estimate() == 0.0);
74
+ REQUIRE(compact_sketch.get_lower_bound(1) == 0.0);
75
+ REQUIRE(compact_sketch.get_upper_bound(1) > 0);
76
+ }
77
+
78
+ TEST_CASE("theta sketch: single item", "[theta_sketch]") { // one update with default settings: estimate and both bounds must be exactly 1
79
+ update_theta_sketch update_sketch = update_theta_sketch::builder().build();
80
+ update_sketch.update(1);
81
+ REQUIRE_FALSE(update_sketch.is_empty());
82
+ REQUIRE_FALSE(update_sketch.is_estimation_mode());
83
+ REQUIRE(update_sketch.get_theta() == 1.0);
84
+ REQUIRE(update_sketch.get_estimate() == 1.0);
85
+ REQUIRE(update_sketch.get_lower_bound(1) == 1.0);
86
+ REQUIRE(update_sketch.get_upper_bound(1) == 1.0);
87
+
88
+ compact_theta_sketch compact_sketch = update_sketch.compact(); // the compact form must pass the same checks
89
+ REQUIRE_FALSE(compact_sketch.is_empty());
90
+ REQUIRE_FALSE(compact_sketch.is_estimation_mode());
91
+ REQUIRE(compact_sketch.get_theta() == 1.0);
92
+ REQUIRE(compact_sketch.get_estimate() == 1.0);
93
+ REQUIRE(compact_sketch.get_lower_bound(1) == 1.0);
94
+ REQUIRE(compact_sketch.get_upper_bound(1) == 1.0);
95
+ }
96
+
97
+ TEST_CASE("theta sketch: resize exact", "[theta_sketch]") { // 2000 distinct updates must still be counted exactly
98
+ update_theta_sketch update_sketch = update_theta_sketch::builder().build();
99
+ for (int i = 0; i < 2000; i++) update_sketch.update(i); // per the test name this is meant to force internal resizing - not visible from here
100
+ REQUIRE_FALSE(update_sketch.is_empty());
101
+ REQUIRE_FALSE(update_sketch.is_estimation_mode());
102
+ REQUIRE(update_sketch.get_theta() == 1.0);
103
+ REQUIRE(update_sketch.get_estimate() == 2000.0);
104
+ REQUIRE(update_sketch.get_lower_bound(1) == 2000.0);
105
+ REQUIRE(update_sketch.get_upper_bound(1) == 2000.0);
106
+
107
+ compact_theta_sketch compact_sketch = update_sketch.compact(); // the compact form must pass the same checks
108
+ REQUIRE_FALSE(compact_sketch.is_empty());
109
+ REQUIRE_FALSE(compact_sketch.is_estimation_mode());
110
+ REQUIRE(compact_sketch.get_theta() == 1.0);
111
+ REQUIRE(compact_sketch.get_estimate() == 2000.0);
112
+ REQUIRE(compact_sketch.get_lower_bound(1) == 2000.0);
113
+ REQUIRE(compact_sketch.get_upper_bound(1) == 2000.0);
114
+ }
115
+
116
+ TEST_CASE("theta sketch: estimation", "[theta_sketch]") { // 8000 updates must put the sketch in estimation mode with an estimate within 1% of n
117
+ update_theta_sketch update_sketch = update_theta_sketch::builder().set_resize_factor(update_theta_sketch::resize_factor::X1).build();
118
+ const int n = 8000;
119
+ for (int i = 0; i < n; i++) update_sketch.update(i);
120
+ //std::cerr << update_sketch.to_string();
121
+ REQUIRE_FALSE(update_sketch.is_empty());
122
+ REQUIRE(update_sketch.is_estimation_mode());
123
+ REQUIRE(update_sketch.get_theta() < 1.0);
124
+ REQUIRE(update_sketch.get_estimate() == Approx((double) n).margin(n * 0.01)); // absolute tolerance of 80
125
+ REQUIRE(update_sketch.get_lower_bound(1) < n); // the bounds must bracket the true count
126
+ REQUIRE(update_sketch.get_upper_bound(1) > n);
127
+
128
+ const uint32_t k = 1 << update_theta_sketch::builder::DEFAULT_LG_K; // 2 raised to the builder's default lg_k
129
+ REQUIRE(update_sketch.get_num_retained() >= k); // before trim() the sketch may hold k or more entries
130
+ update_sketch.trim();
131
+ REQUIRE(update_sketch.get_num_retained() == k); // after trim() exactly k entries remain
132
+
133
+ compact_theta_sketch compact_sketch = update_sketch.compact(); // compacted after the trim above
134
+ REQUIRE_FALSE(compact_sketch.is_empty());
135
+ REQUIRE(compact_sketch.is_ordered());
136
+ REQUIRE(compact_sketch.is_estimation_mode());
137
+ REQUIRE(compact_sketch.get_theta() < 1.0);
138
+ REQUIRE(compact_sketch.get_estimate() == Approx((double) n).margin(n * 0.01));
139
+ REQUIRE(compact_sketch.get_lower_bound(1) < n);
140
+ REQUIRE(compact_sketch.get_upper_bound(1) > n);
141
+ }
142
+
143
+ TEST_CASE("theta sketch: deserialize compact empty from java", "[theta_sketch]") { // reads the empty compact-sketch fixture and checks the empty state
144
+ std::ifstream is;
145
+ is.exceptions(std::ios::failbit | std::ios::badbit); // make stream failures throw, so a missing fixture surfaces at open()
146
+ is.open(inputPath + "theta_compact_empty_from_java.sk", std::ios::binary);
147
+ auto sketch = compact_theta_sketch::deserialize(is);
148
+ REQUIRE(sketch.is_empty());
149
+ REQUIRE_FALSE(sketch.is_estimation_mode());
150
+ REQUIRE(sketch.get_num_retained() == 0);
151
+ REQUIRE(sketch.get_theta() == 1.0);
152
+ REQUIRE(sketch.get_estimate() == 0.0);
153
+ REQUIRE(sketch.get_lower_bound(1) == 0.0);
154
+ REQUIRE(sketch.get_upper_bound(1) == 0.0);
155
+ }
156
+
157
+ TEST_CASE("theta sketch: deserialize single item from java", "[theta_sketch]") { // reads the single-item compact-sketch fixture and checks an exact count of 1
158
+ std::ifstream is;
159
+ is.exceptions(std::ios::failbit | std::ios::badbit); // make stream failures throw, so a missing fixture surfaces at open()
160
+ is.open(inputPath + "theta_compact_single_item_from_java.sk", std::ios::binary);
161
+ auto sketch = compact_theta_sketch::deserialize(is);
162
+ REQUIRE_FALSE(sketch.is_empty());
163
+ REQUIRE_FALSE(sketch.is_estimation_mode());
164
+ REQUIRE(sketch.get_num_retained() == 1);
165
+ REQUIRE(sketch.get_theta() == 1.0);
166
+ REQUIRE(sketch.get_estimate() == 1.0);
167
+ REQUIRE(sketch.get_lower_bound(1) == 1.0);
168
+ REQUIRE(sketch.get_upper_bound(1) == 1.0);
169
+ }
170
+
171
+ TEST_CASE("theta sketch: deserialize compact estimation from java", "[theta_sketch]") {
172
+ std::ifstream is;
173
+ is.exceptions(std::ios::failbit | std::ios::badbit);
174
+ is.open(inputPath + "theta_compact_estimation_from_java.sk", std::ios::binary);
175
+ auto sketch = compact_theta_sketch::deserialize(is);
176
+ REQUIRE_FALSE(sketch.is_empty());
177
+ REQUIRE(sketch.is_estimation_mode());
178
+ REQUIRE(sketch.is_ordered());
179
+ REQUIRE(sketch.get_num_retained() == 4342);
180
+ REQUIRE(sketch.get_theta() == Approx(0.531700444213199).margin(1e-10));
181
+ REQUIRE(sketch.get_estimate() == Approx(8166.25234614053).margin(1e-10));
182
+ REQUIRE(sketch.get_lower_bound(2) == Approx(7996.956955317471).margin(1e-10));
183
+ REQUIRE(sketch.get_upper_bound(2) == Approx(8339.090301078124).margin(1e-10));
184
+
185
+ // the same construction process in Java must have produced exactly the same sketch
186
+ update_theta_sketch update_sketch = update_theta_sketch::builder().build();
187
+ const int n = 8192;
188
+ for (int i = 0; i < n; i++) update_sketch.update(i);
189
+ REQUIRE(sketch.get_num_retained() == update_sketch.get_num_retained());
190
+ REQUIRE(sketch.get_theta() == Approx(update_sketch.get_theta()).margin(1e-10));
191
+ REQUIRE(sketch.get_estimate() == Approx(update_sketch.get_estimate()).margin(1e-10));
192
+ REQUIRE(sketch.get_lower_bound(1) == Approx(update_sketch.get_lower_bound(1)).margin(1e-10));
193
+ REQUIRE(sketch.get_upper_bound(1) == Approx(update_sketch.get_upper_bound(1)).margin(1e-10));
194
+ REQUIRE(sketch.get_lower_bound(2) == Approx(update_sketch.get_lower_bound(2)).margin(1e-10));
195
+ REQUIRE(sketch.get_upper_bound(2) == Approx(update_sketch.get_upper_bound(2)).margin(1e-10));
196
+ REQUIRE(sketch.get_lower_bound(3) == Approx(update_sketch.get_lower_bound(3)).margin(1e-10));
197
+ REQUIRE(sketch.get_upper_bound(3) == Approx(update_sketch.get_upper_bound(3)).margin(1e-10));
198
+ compact_theta_sketch compact_sketch = update_sketch.compact();
199
+ // the sketches are ordered, so the iteration sequence must match exactly
200
+ auto iter = sketch.begin();
201
+ for (const auto& key: compact_sketch) {
202
+ REQUIRE(*iter == key);
203
+ ++iter;
204
+ }
205
+ }
206
+
207
+ TEST_CASE("theta sketch: serialize deserialize stream and bytes equivalence", "[theta_sketch]") {
208
+ update_theta_sketch update_sketch = update_theta_sketch::builder().build();
209
+ const int n = 8192;
210
+ for (int i = 0; i < n; i++) update_sketch.update(i);
211
+
212
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
213
+ update_sketch.compact().serialize(s);
214
+ auto bytes = update_sketch.compact().serialize();
215
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
216
+ for (size_t i = 0; i < bytes.size(); ++i) {
217
+ REQUIRE(((char*)bytes.data())[i] == (char)s.get());
218
+ }
219
+
220
+ s.seekg(0); // rewind
221
+ compact_theta_sketch deserialized_sketch1 = compact_theta_sketch::deserialize(s);
222
+ compact_theta_sketch deserialized_sketch2 = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
223
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
224
+ REQUIRE(deserialized_sketch2.is_empty() == deserialized_sketch1.is_empty());
225
+ REQUIRE(deserialized_sketch2.is_ordered() == deserialized_sketch1.is_ordered());
226
+ REQUIRE(deserialized_sketch2.get_num_retained() == deserialized_sketch1.get_num_retained());
227
+ REQUIRE(deserialized_sketch2.get_theta() == deserialized_sketch1.get_theta());
228
+ REQUIRE(deserialized_sketch2.get_estimate() == deserialized_sketch1.get_estimate());
229
+ REQUIRE(deserialized_sketch2.get_lower_bound(1) == deserialized_sketch1.get_lower_bound(1));
230
+ REQUIRE(deserialized_sketch2.get_upper_bound(1) == deserialized_sketch1.get_upper_bound(1));
231
+ // the sketches are ordered, so the iteration sequence must match exactly
232
+ auto iter = deserialized_sketch1.begin();
233
+ for (auto key: deserialized_sketch2) {
234
+ REQUIRE(*iter == key);
235
+ ++iter;
236
+ }
237
+ }
238
+
239
+ TEST_CASE("theta sketch: deserialize compact single item buffer overrun", "[theta_sketch]") {
240
+ update_theta_sketch update_sketch = update_theta_sketch::builder().build();
241
+ update_sketch.update(1);
242
+ auto bytes = update_sketch.compact().serialize();
243
+ REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 7), std::out_of_range);
244
+ REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
245
+ }
246
+
247
+ } /* namespace datasketches */
@@ -0,0 +1,44 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <iostream>
21
+
22
+ #include <catch.hpp>
23
+ #include <tuple_union.hpp>
24
+
25
+ #include <theta_union_experimental.hpp>
26
+
27
+ namespace datasketches {
28
+
29
+ TEST_CASE("theta_union_exeperimental") {
30
+ auto update_sketch1 = update_theta_sketch_experimental<>::builder().build();
31
+ update_sketch1.update(1);
32
+ update_sketch1.update(2);
33
+
34
+ auto update_sketch2 = update_theta_sketch_experimental<>::builder().build();
35
+ update_sketch2.update(1);
36
+ update_sketch2.update(3);
37
+
38
+ auto u = theta_union_experimental<>::builder().build();
39
+ u.update(update_sketch1);
40
+ u.update(update_sketch2);
41
+ auto r = u.get_result();
42
+ }
43
+
44
+ } /* namespace datasketches */
@@ -0,0 +1,289 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <iostream>
21
+
22
+ #include <catch.hpp>
23
+ #include <tuple_a_not_b.hpp>
24
+ #include <theta_sketch_experimental.hpp>
25
+
26
+ namespace datasketches {
27
+
28
+ TEST_CASE("tuple a-not-b: empty", "[tuple_a_not_b]") {
29
+ auto a = update_tuple_sketch<float>::builder().build();
30
+ auto b = update_tuple_sketch<float>::builder().build();
31
+ tuple_a_not_b<float> a_not_b;
32
+ auto result = a_not_b.compute(a, b);
33
+ REQUIRE(result.get_num_retained() == 0);
34
+ REQUIRE(result.is_empty());
35
+ REQUIRE_FALSE(result.is_estimation_mode());
36
+ REQUIRE(result.get_estimate() == 0.0);
37
+ }
38
+
39
+ TEST_CASE("tuple a-not-b: non empty no retained keys", "[tuple_a_not_b]") {
40
+ auto a = update_tuple_sketch<float>::builder().build();
41
+ a.update(1, 1);
42
+ auto b = update_tuple_sketch<float>::builder().set_p(0.001).build();
43
+ tuple_a_not_b<float> a_not_b;
44
+
45
+ // B is still empty
46
+ auto result = a_not_b.compute(a, b);
47
+ REQUIRE_FALSE(result.is_empty());
48
+ REQUIRE_FALSE(result.is_estimation_mode());
49
+ REQUIRE(result.get_num_retained() == 1);
50
+ REQUIRE(result.get_theta() == Approx(1).margin(1e-10));
51
+ REQUIRE(result.get_estimate() == 1.0);
52
+
53
+ // B is not empty in estimation mode and no entries
54
+ b.update(1, 1);
55
+ REQUIRE(b.get_num_retained() == 0);
56
+
57
+ result = a_not_b.compute(a, b);
58
+ REQUIRE_FALSE(result.is_empty());
59
+ REQUIRE(result.is_estimation_mode());
60
+ REQUIRE(result.get_num_retained() == 0);
61
+ REQUIRE(result.get_theta() == Approx(0.001).margin(1e-10));
62
+ REQUIRE(result.get_estimate() == 0.0);
63
+ }
64
+
65
+ TEST_CASE("tuple a-not-b: exact mode half overlap", "[tuple_a_not_b]") {
66
+ auto a = update_tuple_sketch<float>::builder().build();
67
+ int value = 0;
68
+ for (int i = 0; i < 1000; i++) a.update(value++, 1);
69
+
70
+ auto b = update_tuple_sketch<float>::builder().build();
71
+ value = 500;
72
+ for (int i = 0; i < 1000; i++) b.update(value++, 1);
73
+
74
+ tuple_a_not_b<float> a_not_b;
75
+
76
+ // unordered inputs, ordered result
77
+ auto result = a_not_b.compute(a, b);
78
+ REQUIRE_FALSE(result.is_empty());
79
+ REQUIRE_FALSE(result.is_estimation_mode());
80
+ REQUIRE(result.is_ordered());
81
+ REQUIRE(result.get_estimate() == 500.0);
82
+
83
+ // unordered inputs, unordered result
84
+ result = a_not_b.compute(a, b, false);
85
+ REQUIRE_FALSE(result.is_empty());
86
+ REQUIRE_FALSE(result.is_estimation_mode());
87
+ REQUIRE_FALSE(result.is_ordered());
88
+ REQUIRE(result.get_estimate() == 500.0);
89
+
90
+ // ordered inputs
91
+ result = a_not_b.compute(a.compact(), b.compact());
92
+ REQUIRE_FALSE(result.is_empty());
93
+ REQUIRE_FALSE(result.is_estimation_mode());
94
+ REQUIRE(result.is_ordered());
95
+ REQUIRE(result.get_estimate() == 500.0);
96
+
97
+ // A is ordered, so the result is ordered regardless
98
+ result = a_not_b.compute(a.compact(), b, false);
99
+ REQUIRE_FALSE(result.is_empty());
100
+ REQUIRE_FALSE(result.is_estimation_mode());
101
+ REQUIRE(result.is_ordered());
102
+ REQUIRE(result.get_estimate() == 500.0);
103
+ }
104
+
105
+ // needed until promotion of experimental to replace existing theta sketch
106
+ using update_theta_sketch = update_theta_sketch_experimental<>;
107
+
108
+ TEST_CASE("mixed a-not-b: exact mode half overlap", "[tuple_a_not_b]") {
109
+ auto a = update_tuple_sketch<float>::builder().build();
110
+ int value = 0;
111
+ for (int i = 0; i < 1000; i++) a.update(value++, 1);
112
+
113
+ auto b = update_theta_sketch::builder().build();
114
+ value = 500;
115
+ for (int i = 0; i < 1000; i++) b.update(value++);
116
+
117
+ tuple_a_not_b<float> a_not_b;
118
+
119
+ // unordered inputs, ordered result
120
+ auto result = a_not_b.compute(a, compact_tuple_sketch<float>(b, 1, false));
121
+ REQUIRE_FALSE(result.is_empty());
122
+ REQUIRE_FALSE(result.is_estimation_mode());
123
+ REQUIRE(result.is_ordered());
124
+ REQUIRE(result.get_estimate() == 500.0);
125
+
126
+ // unordered inputs, unordered result
127
+ result = a_not_b.compute(a, compact_tuple_sketch<float>(b, 1, false), false);
128
+ REQUIRE_FALSE(result.is_empty());
129
+ REQUIRE_FALSE(result.is_estimation_mode());
130
+ REQUIRE_FALSE(result.is_ordered());
131
+ REQUIRE(result.get_estimate() == 500.0);
132
+
133
+ // ordered inputs
134
+ result = a_not_b.compute(a.compact(), compact_tuple_sketch<float>(b.compact(), 1));
135
+ REQUIRE_FALSE(result.is_empty());
136
+ REQUIRE_FALSE(result.is_estimation_mode());
137
+ REQUIRE(result.is_ordered());
138
+ REQUIRE(result.get_estimate() == 500.0);
139
+
140
+ // A is ordered, so the result is ordered regardless
141
+ result = a_not_b.compute(a.compact(), compact_tuple_sketch<float>(b, 1, false), false);
142
+ REQUIRE_FALSE(result.is_empty());
143
+ REQUIRE_FALSE(result.is_estimation_mode());
144
+ REQUIRE(result.is_ordered());
145
+ REQUIRE(result.get_estimate() == 500.0);
146
+ }
147
+
148
+ TEST_CASE("tuple a-not-b: exact mode disjoint", "[tuple_a_not_b]") {
149
+ auto a = update_tuple_sketch<float>::builder().build();
150
+ int value = 0;
151
+ for (int i = 0; i < 1000; i++) a.update(value++, 1);
152
+
153
+ auto b = update_tuple_sketch<float>::builder().build();
154
+ for (int i = 0; i < 1000; i++) b.update(value++, 1);
155
+
156
+ tuple_a_not_b<float> a_not_b;
157
+
158
+ // unordered inputs
159
+ auto result = a_not_b.compute(a, b);
160
+ REQUIRE_FALSE(result.is_empty());
161
+ REQUIRE_FALSE(result.is_estimation_mode());
162
+ REQUIRE(result.get_estimate() == 1000.0);
163
+
164
+ // ordered inputs
165
+ result = a_not_b.compute(a.compact(), b.compact());
166
+ REQUIRE_FALSE(result.is_empty());
167
+ REQUIRE_FALSE(result.is_estimation_mode());
168
+ REQUIRE(result.get_estimate() == 1000.0);
169
+ }
170
+
171
+ TEST_CASE("tuple a-not-b: exact mode full overlap", "[tuple_a_not_b]") {
172
+ auto sketch = update_tuple_sketch<float>::builder().build();
173
+ int value = 0;
174
+ for (int i = 0; i < 1000; i++) sketch.update(value++, 1);
175
+
176
+ tuple_a_not_b<float> a_not_b;
177
+
178
+ // unordered inputs
179
+ auto result = a_not_b.compute(sketch, sketch);
180
+ REQUIRE(result.is_empty());
181
+ REQUIRE_FALSE(result.is_estimation_mode());
182
+ REQUIRE(result.get_estimate() == 0.0);
183
+
184
+ // ordered inputs
185
+ result = a_not_b.compute(sketch.compact(), sketch.compact());
186
+ REQUIRE(result.is_empty());
187
+ REQUIRE_FALSE(result.is_estimation_mode());
188
+ REQUIRE(result.get_estimate() == 0.0);
189
+ }
190
+
191
+ TEST_CASE("tuple a-not-b: estimation mode half overlap", "[tuple_a_not_b]") {
192
+ auto a = update_tuple_sketch<float>::builder().build();
193
+ int value = 0;
194
+ for (int i = 0; i < 10000; i++) a.update(value++, 1);
195
+
196
+ auto b = update_tuple_sketch<float>::builder().build();
197
+ value = 5000;
198
+ for (int i = 0; i < 10000; i++) b.update(value++, 1);
199
+
200
+ tuple_a_not_b<float> a_not_b;
201
+
202
+ // unordered inputs
203
+ auto result = a_not_b.compute(a, b);
204
+ REQUIRE_FALSE(result.is_empty());
205
+ REQUIRE(result.is_estimation_mode());
206
+ REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
207
+
208
+ // ordered inputs
209
+ result = a_not_b.compute(a.compact(), b.compact());
210
+ REQUIRE_FALSE(result.is_empty());
211
+ REQUIRE(result.is_estimation_mode());
212
+ REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
213
+ }
214
+
215
+ TEST_CASE("tuple a-not-b: estimation mode disjoint", "[tuple_a_not_b]") {
216
+ auto a = update_tuple_sketch<float>::builder().build();
217
+ int value = 0;
218
+ for (int i = 0; i < 10000; i++) a.update(value++, 1);
219
+
220
+ auto b = update_tuple_sketch<float>::builder().build();
221
+ for (int i = 0; i < 10000; i++) b.update(value++, 1);
222
+
223
+ tuple_a_not_b<float> a_not_b;
224
+
225
+ // unordered inputs
226
+ auto result = a_not_b.compute(a, b);
227
+ REQUIRE_FALSE(result.is_empty());
228
+ REQUIRE(result.is_estimation_mode());
229
+ REQUIRE(result.get_estimate() == Approx(10000).margin(10000 * 0.02));
230
+
231
+ // ordered inputs
232
+ result = a_not_b.compute(a.compact(), b.compact());
233
+ REQUIRE_FALSE(result.is_empty());
234
+ REQUIRE(result.is_estimation_mode());
235
+ REQUIRE(result.get_estimate() == Approx(10000).margin(10000 * 0.02));
236
+ }
237
+
238
+ TEST_CASE("tuple a-not-b: estimation mode full overlap", "[tuple_a_not_b]") {
239
+ auto sketch = update_tuple_sketch<float>::builder().build();
240
+ int value = 0;
241
+ for (int i = 0; i < 10000; i++) sketch.update(value++, 1);
242
+
243
+ tuple_a_not_b<float> a_not_b;
244
+
245
+ // unordered inputs
246
+ auto result = a_not_b.compute(sketch, sketch);
247
+ REQUIRE_FALSE(result.is_empty());
248
+ REQUIRE(result.is_estimation_mode());
249
+ REQUIRE(result.get_estimate() == 0.0);
250
+
251
+ // ordered inputs
252
+ result = a_not_b.compute(sketch.compact(), sketch.compact());
253
+ REQUIRE_FALSE(result.is_empty());
254
+ REQUIRE(result.is_estimation_mode());
255
+ REQUIRE(result.get_estimate() == 0.0);
256
+ }
257
+
258
+ TEST_CASE("tuple a-not-b: seed mismatch", "[tuple_a_not_b]") {
259
+ auto sketch = update_tuple_sketch<float>::builder().build();
260
+ sketch.update(1, 1); // non-empty should not be ignored
261
+ tuple_a_not_b<float> a_not_b(123);
262
+ REQUIRE_THROWS_AS(a_not_b.compute(sketch, sketch), std::invalid_argument);
263
+ }
264
+
265
+ TEST_CASE("tuple a-not-b: issue #152", "[tuple_a_not_b]") {
266
+ auto a = update_tuple_sketch<float>::builder().build();
267
+ int value = 0;
268
+ for (int i = 0; i < 10000; i++) a.update(value++, 1);
269
+
270
+ auto b = update_tuple_sketch<float>::builder().build();
271
+ value = 5000;
272
+ for (int i = 0; i < 25000; i++) b.update(value++, 1);
273
+
274
+ tuple_a_not_b<float> a_not_b;
275
+
276
+ // unordered inputs
277
+ auto result = a_not_b.compute(a, b);
278
+ REQUIRE_FALSE(result.is_empty());
279
+ REQUIRE(result.is_estimation_mode());
280
+ REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.03));
281
+
282
+ // ordered inputs
283
+ result = a_not_b.compute(a.compact(), b.compact());
284
+ REQUIRE_FALSE(result.is_empty());
285
+ REQUIRE(result.is_estimation_mode());
286
+ REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.03));
287
+ }
288
+
289
+ } /* namespace datasketches */