datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,218 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch.hpp>
21
+
22
+ #include <theta_intersection.hpp>
23
+
24
+ namespace datasketches {
25
+
26
+ TEST_CASE("theta intersection: invalid", "[theta_intersection]") { // checks an intersection that has received no update() call
27
+ theta_intersection intersection;
28
+ REQUIRE_FALSE(intersection.has_result()); // no result is expected before any update
29
+ REQUIRE_THROWS_AS(intersection.get_result(), std::invalid_argument); // requesting the result anyway is expected to throw
30
+ }
31
+
32
+ TEST_CASE("theta intersection: empty", "[theta_intersection]") { // intersects with a freshly built sketch that was never updated
33
+ theta_intersection intersection;
34
+ update_theta_sketch sketch = update_theta_sketch::builder().build();
35
+ intersection.update(sketch);
36
+ compact_theta_sketch result = intersection.get_result(); // result after a single update()
37
+ REQUIRE(result.get_num_retained() == 0);
38
+ REQUIRE(result.is_empty());
39
+ REQUIRE_FALSE(result.is_estimation_mode());
40
+ REQUIRE(result.get_estimate() == 0.0);
41
+
42
+ intersection.update(sketch); // second update with the same sketch; the assertions below repeat the first set
43
+ result = intersection.get_result();
44
+ REQUIRE(result.get_num_retained() == 0);
45
+ REQUIRE(result.is_empty());
46
+ REQUIRE_FALSE(result.is_estimation_mode());
47
+ REQUIRE(result.get_estimate() == 0.0);
48
+ }
49
+
50
+ TEST_CASE("theta intersection: non empty no retained keys", "[theta_intersection]") { // sketch is updated once, yet the result is expected to hold no entries
51
+ update_theta_sketch sketch = update_theta_sketch::builder().set_p(0.001).build(); // NOTE(review): set_p presumably sets a sampling probability — confirm against the builder API
52
+ sketch.update(1);
53
+ theta_intersection intersection;
54
+ intersection.update(sketch);
55
+ compact_theta_sketch result = intersection.get_result();
56
+ REQUIRE(result.get_num_retained() == 0);
57
+ REQUIRE_FALSE(result.is_empty()); // expected to report non-empty even though nothing is retained
58
+ REQUIRE(result.is_estimation_mode());
59
+ REQUIRE(result.get_theta() == Approx(0.001).margin(1e-10)); // theta is expected to match the 0.001 given to set_p
60
+ REQUIRE(result.get_estimate() == 0.0);
61
+
62
+ intersection.update(sketch); // second update with the same sketch; the assertions below repeat the first set
63
+ result = intersection.get_result();
64
+ REQUIRE(result.get_num_retained() == 0);
65
+ REQUIRE_FALSE(result.is_empty());
66
+ REQUIRE(result.is_estimation_mode());
67
+ REQUIRE(result.get_theta() == Approx(0.001).margin(1e-10));
68
+ REQUIRE(result.get_estimate() == 0.0);
69
+ }
70
+
71
+ TEST_CASE("theta intersection: exact mode half overlap unordered", "[theta_intersection]") { // two 1000-item update sketches sharing 500 values, passed to update() directly
72
+ update_theta_sketch sketch1 = update_theta_sketch::builder().build();
73
+ int value = 0;
74
+ for (int i = 0; i < 1000; i++) sketch1.update(value++); // sketch1 receives 0..999
75
+
76
+ update_theta_sketch sketch2 = update_theta_sketch::builder().build();
77
+ value = 500;
78
+ for (int i = 0; i < 1000; i++) sketch2.update(value++); // sketch2 receives 500..1499, so 500..999 is common to both
79
+
80
+ theta_intersection intersection;
81
+ intersection.update(sketch1);
82
+ intersection.update(sketch2);
83
+ compact_theta_sketch result = intersection.get_result();
84
+ REQUIRE_FALSE(result.is_empty());
85
+ REQUIRE_FALSE(result.is_estimation_mode());
86
+ REQUIRE(result.get_estimate() == 500.0); // exact count of the 500 common values
87
+ }
88
+
89
+ TEST_CASE("theta intersection: exact mode half overlap ordered", "[theta_intersection]") { // two 1000-item sketches sharing 500 values, each passed through compact() before update()
90
+ update_theta_sketch sketch1 = update_theta_sketch::builder().build();
91
+ int value = 0;
92
+ for (int i = 0; i < 1000; i++) sketch1.update(value++); // sketch1 receives 0..999
93
+
94
+ update_theta_sketch sketch2 = update_theta_sketch::builder().build();
95
+ value = 500;
96
+ for (int i = 0; i < 1000; i++) sketch2.update(value++); // sketch2 receives 500..1499, so 500..999 is common to both
97
+
98
+ theta_intersection intersection;
99
+ intersection.update(sketch1.compact()); // NOTE(review): compact() presumably yields an ordered compact sketch, per the test name — confirm
100
+ intersection.update(sketch2.compact());
101
+ compact_theta_sketch result = intersection.get_result();
102
+ REQUIRE_FALSE(result.is_empty());
103
+ REQUIRE_FALSE(result.is_estimation_mode());
104
+ REQUIRE(result.get_estimate() == 500.0); // exact count of the 500 common values
105
+ }
106
+
107
+ TEST_CASE("theta intersection: exact mode disjoint unordered", "[theta_intersection]") { // two 1000-item update sketches with no common values, passed to update() directly
108
+ update_theta_sketch sketch1 = update_theta_sketch::builder().build();
109
+ int value = 0;
110
+ for (int i = 0; i < 1000; i++) sketch1.update(value++); // sketch1 receives 0..999
111
+
112
+ update_theta_sketch sketch2 = update_theta_sketch::builder().build();
113
+ for (int i = 0; i < 1000; i++) sketch2.update(value++); // value is not reset, so sketch2 receives 1000..1999
114
+
115
+ theta_intersection intersection;
116
+ intersection.update(sketch1);
117
+ intersection.update(sketch2);
118
+ compact_theta_sketch result = intersection.get_result();
119
+ REQUIRE(result.is_empty()); // the disjoint intersection is expected to report empty here
120
+ REQUIRE_FALSE(result.is_estimation_mode());
121
+ REQUIRE(result.get_estimate() == 0.0);
122
+ }
123
+
124
+ TEST_CASE("theta intersection: exact mode disjoint ordered", "[theta_intersection]") { // two 1000-item sketches with no common values, each passed through compact() before update()
125
+ update_theta_sketch sketch1 = update_theta_sketch::builder().build();
126
+ int value = 0;
127
+ for (int i = 0; i < 1000; i++) sketch1.update(value++); // sketch1 receives 0..999
128
+
129
+ update_theta_sketch sketch2 = update_theta_sketch::builder().build();
130
+ for (int i = 0; i < 1000; i++) sketch2.update(value++); // value is not reset, so sketch2 receives 1000..1999
131
+
132
+ theta_intersection intersection;
133
+ intersection.update(sketch1.compact()); // NOTE(review): compact() presumably yields an ordered compact sketch, per the test name — confirm
134
+ intersection.update(sketch2.compact());
135
+ compact_theta_sketch result = intersection.get_result();
136
+ REQUIRE(result.is_empty()); // the disjoint intersection is expected to report empty here
137
+ REQUIRE_FALSE(result.is_estimation_mode());
138
+ REQUIRE(result.get_estimate() == 0.0);
139
+ }
140
+
141
+ TEST_CASE("theta intersection: estimation mode half overlap unordered", "[theta_intersection]") { // two 10000-item update sketches sharing 5000 values, passed to update() directly
142
+ update_theta_sketch sketch1 = update_theta_sketch::builder().build();
143
+ int value = 0;
144
+ for (int i = 0; i < 10000; i++) sketch1.update(value++); // sketch1 receives 0..9999
145
+
146
+ update_theta_sketch sketch2 = update_theta_sketch::builder().build();
147
+ value = 5000;
148
+ for (int i = 0; i < 10000; i++) sketch2.update(value++); // sketch2 receives 5000..14999, so 5000..9999 is common to both
149
+
150
+ theta_intersection intersection;
151
+ intersection.update(sketch1);
152
+ intersection.update(sketch2);
153
+ compact_theta_sketch result = intersection.get_result();
154
+ REQUIRE_FALSE(result.is_empty());
155
+ REQUIRE(result.is_estimation_mode()); // with this many updates the result is expected to be an estimate
156
+ REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02)); // 5000 common values, accepted within a margin of 100 (2%)
157
+ }
158
+
159
+ TEST_CASE("theta intersection: estimation mode half overlap ordered", "[theta_intersection]") { // two 10000-item sketches sharing 5000 values, each passed through compact() before update()
160
+ update_theta_sketch sketch1 = update_theta_sketch::builder().build();
161
+ int value = 0;
162
+ for (int i = 0; i < 10000; i++) sketch1.update(value++); // sketch1 receives 0..9999
163
+
164
+ update_theta_sketch sketch2 = update_theta_sketch::builder().build();
165
+ value = 5000;
166
+ for (int i = 0; i < 10000; i++) sketch2.update(value++); // sketch2 receives 5000..14999, so 5000..9999 is common to both
167
+
168
+ theta_intersection intersection;
169
+ intersection.update(sketch1.compact()); // NOTE(review): compact() presumably yields an ordered compact sketch, per the test name — confirm
170
+ intersection.update(sketch2.compact());
171
+ compact_theta_sketch result = intersection.get_result();
172
+ REQUIRE_FALSE(result.is_empty());
173
+ REQUIRE(result.is_estimation_mode()); // with this many updates the result is expected to be an estimate
174
+ REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02)); // 5000 common values, accepted within a margin of 100 (2%)
175
+ }
176
+
177
+ TEST_CASE("theta intersection: estimation mode disjoint unordered", "[theta_intersection]") { // two 10000-item update sketches with no common values, passed to update() directly
178
+ update_theta_sketch sketch1 = update_theta_sketch::builder().build();
179
+ int value = 0;
180
+ for (int i = 0; i < 10000; i++) sketch1.update(value++); // sketch1 receives 0..9999
181
+
182
+ update_theta_sketch sketch2 = update_theta_sketch::builder().build();
183
+ for (int i = 0; i < 10000; i++) sketch2.update(value++); // value is not reset, so sketch2 receives 10000..19999
184
+
185
+ theta_intersection intersection;
186
+ intersection.update(sketch1);
187
+ intersection.update(sketch2);
188
+ compact_theta_sketch result = intersection.get_result();
189
+ REQUIRE_FALSE(result.is_empty()); // expected to report non-empty even though no values are shared
190
+ REQUIRE(result.is_estimation_mode());
191
+ REQUIRE(result.get_estimate() == 0.0);
192
+ }
193
+
194
+ TEST_CASE("theta intersection: estimation mode disjoint ordered", "[theta_intersection]") { // two 10000-item sketches with no common values, each passed through compact() before update()
195
+ update_theta_sketch sketch1 = update_theta_sketch::builder().build();
196
+ int value = 0;
197
+ for (int i = 0; i < 10000; i++) sketch1.update(value++); // sketch1 receives 0..9999
198
+
199
+ update_theta_sketch sketch2 = update_theta_sketch::builder().build();
200
+ for (int i = 0; i < 10000; i++) sketch2.update(value++); // value is not reset, so sketch2 receives 10000..19999
201
+
202
+ theta_intersection intersection;
203
+ intersection.update(sketch1.compact()); // NOTE(review): compact() presumably yields an ordered compact sketch, per the test name — confirm
204
+ intersection.update(sketch2.compact());
205
+ compact_theta_sketch result = intersection.get_result();
206
+ REQUIRE_FALSE(result.is_empty()); // expected to report non-empty even though no values are shared
207
+ REQUIRE(result.is_estimation_mode());
208
+ REQUIRE(result.get_estimate() == 0.0);
209
+ }
210
+
211
+ TEST_CASE("theta intersection: seed mismatch", "[theta_intersection]") { // default-built sketch fed to an intersection constructed with 123
212
+ update_theta_sketch sketch = update_theta_sketch::builder().build();
213
+ sketch.update(1); // non-empty should not be ignored
214
+ theta_intersection intersection(123); // NOTE(review): 123 is presumably a hash seed differing from the builder's default — confirm
215
+ REQUIRE_THROWS_AS(intersection.update(sketch), std::invalid_argument); // update() is expected to reject the sketch
216
+ }
217
+
218
+ } /* namespace datasketches */
@@ -0,0 +1,438 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch.hpp>
21
+ #include <fstream>
22
+ #include <sstream>
23
+
24
+ #include <theta_sketch.hpp>
25
+
26
+ namespace datasketches {
27
+
28
+ #ifdef TEST_BINARY_INPUT_PATH
29
+ const std::string inputPath = TEST_BINARY_INPUT_PATH; // prefix prepended to the .sk fixture file names opened by the deserialize tests; taken from the macro when it is defined
30
+ #else
31
+ const std::string inputPath = "test/"; // fallback prefix used when TEST_BINARY_INPUT_PATH is not defined
32
+ #endif
33
+
34
+ TEST_CASE("theta sketch: empty", "[theta_sketch]") { // a freshly built update sketch and its compact() form must both report empty, not estimation mode, theta 1.0, and estimate/bounds of 0.0
35
+ update_theta_sketch update_sketch = update_theta_sketch::builder().build();
36
+ REQUIRE(update_sketch.is_empty());
37
+ REQUIRE_FALSE(update_sketch.is_estimation_mode());
38
+ REQUIRE(update_sketch.get_theta() == 1.0);
39
+ REQUIRE(update_sketch.get_estimate() == 0.0);
40
+ REQUIRE(update_sketch.get_lower_bound(1) == 0.0);
41
+ REQUIRE(update_sketch.get_upper_bound(1) == 0.0);
42
+
43
+ compact_theta_sketch compact_sketch = update_sketch.compact(); // the same assertions are repeated on the compact form
44
+ REQUIRE(compact_sketch.is_empty());
45
+ REQUIRE_FALSE(compact_sketch.is_estimation_mode());
46
+ REQUIRE(compact_sketch.get_theta() == 1.0);
47
+ REQUIRE(compact_sketch.get_estimate() == 0.0);
48
+ REQUIRE(compact_sketch.get_lower_bound(1) == 0.0);
49
+ REQUIRE(compact_sketch.get_upper_bound(1) == 0.0);
50
+ }
51
+
52
+ TEST_CASE("theta sketch: non empty no retained keys", "[theta_sketch]") { // with set_p(0.001) a single update is expected to retain nothing, yet the sketch must report non-empty and estimation mode
53
+ update_theta_sketch update_sketch = update_theta_sketch::builder().set_p(0.001).build();
54
+ update_sketch.update(1);
55
+ //std::cerr << update_sketch.to_string();
56
+ REQUIRE(update_sketch.get_num_retained() == 0);
57
+ REQUIRE_FALSE(update_sketch.is_empty());
58
+ REQUIRE(update_sketch.is_estimation_mode());
59
+ REQUIRE(update_sketch.get_estimate() == 0.0);
60
+ REQUIRE(update_sketch.get_lower_bound(1) == 0.0);
61
+ REQUIRE(update_sketch.get_upper_bound(1) > 0); // the upper bound must stay positive even though no key was retained
62
+
63
+ compact_theta_sketch compact_sketch = update_sketch.compact(); // the same assertions are repeated on the compact form
64
+ REQUIRE(compact_sketch.get_num_retained() == 0);
65
+ REQUIRE_FALSE(compact_sketch.is_empty());
66
+ REQUIRE(compact_sketch.is_estimation_mode());
67
+ REQUIRE(compact_sketch.get_estimate() == 0.0);
68
+ REQUIRE(compact_sketch.get_lower_bound(1) == 0.0);
69
+ REQUIRE(compact_sketch.get_upper_bound(1) > 0);
70
+ }
71
+
72
+ TEST_CASE("theta sketch: single item", "[theta_sketch]") { // after one update the sketch and its compact() form must be non-empty, not in estimation mode, with theta 1.0 and estimate/bounds of exactly 1.0
73
+ update_theta_sketch update_sketch = update_theta_sketch::builder().build();
74
+ update_sketch.update(1);
75
+ REQUIRE_FALSE(update_sketch.is_empty());
76
+ REQUIRE_FALSE(update_sketch.is_estimation_mode());
77
+ REQUIRE(update_sketch.get_theta() == 1.0);
78
+ REQUIRE(update_sketch.get_estimate() == 1.0);
79
+ REQUIRE(update_sketch.get_lower_bound(1) == 1.0);
80
+ REQUIRE(update_sketch.get_upper_bound(1) == 1.0);
81
+
82
+ compact_theta_sketch compact_sketch = update_sketch.compact(); // the same assertions are repeated on the compact form
83
+ REQUIRE_FALSE(compact_sketch.is_empty());
84
+ REQUIRE_FALSE(compact_sketch.is_estimation_mode());
85
+ REQUIRE(compact_sketch.get_theta() == 1.0);
86
+ REQUIRE(compact_sketch.get_estimate() == 1.0);
87
+ REQUIRE(compact_sketch.get_lower_bound(1) == 1.0);
88
+ REQUIRE(compact_sketch.get_upper_bound(1) == 1.0);
89
+ }
90
+
91
+ TEST_CASE("theta sketch: resize exact", "[theta_sketch]") { // 2000 distinct updates must leave the sketch in exact mode (theta 1.0, estimate and bounds exactly 2000.0); presumably this forces internal table growth, per the test name
92
+ update_theta_sketch update_sketch = update_theta_sketch::builder().build();
93
+ for (int i = 0; i < 2000; i++) update_sketch.update(i);
94
+ REQUIRE_FALSE(update_sketch.is_empty());
95
+ REQUIRE_FALSE(update_sketch.is_estimation_mode());
96
+ REQUIRE(update_sketch.get_theta() == 1.0);
97
+ REQUIRE(update_sketch.get_estimate() == 2000.0);
98
+ REQUIRE(update_sketch.get_lower_bound(1) == 2000.0);
99
+ REQUIRE(update_sketch.get_upper_bound(1) == 2000.0);
100
+
101
+ compact_theta_sketch compact_sketch = update_sketch.compact(); // the same assertions are repeated on the compact form
102
+ REQUIRE_FALSE(compact_sketch.is_empty());
103
+ REQUIRE_FALSE(compact_sketch.is_estimation_mode());
104
+ REQUIRE(compact_sketch.get_theta() == 1.0);
105
+ REQUIRE(compact_sketch.get_estimate() == 2000.0);
106
+ REQUIRE(compact_sketch.get_lower_bound(1) == 2000.0);
107
+ REQUIRE(compact_sketch.get_upper_bound(1) == 2000.0);
108
+ }
109
+
110
+ TEST_CASE("theta sketch: estimation", "[theta_sketch]") { // 8000 distinct updates must put the sketch in estimation mode with an estimate within 1% of n and bounds that bracket n; also checks trim() and the compact form
111
+ update_theta_sketch update_sketch = update_theta_sketch::builder().set_resize_factor(update_theta_sketch::resize_factor::X1).build();
112
+ const int n = 8000;
113
+ for (int i = 0; i < n; i++) update_sketch.update(i);
114
+ //std::cerr << update_sketch.to_string();
115
+ REQUIRE_FALSE(update_sketch.is_empty());
116
+ REQUIRE(update_sketch.is_estimation_mode());
117
+ REQUIRE(update_sketch.get_theta() < 1.0);
118
+ REQUIRE(update_sketch.get_estimate() == Approx((double) n).margin(n * 0.01)); // absolute tolerance of 1% of n
119
+ REQUIRE(update_sketch.get_lower_bound(1) < n);
120
+ REQUIRE(update_sketch.get_upper_bound(1) > n);
121
+
122
+ const uint32_t k = 1 << update_theta_sketch::builder::DEFAULT_LG_K; // nominal size derived from the builder's default lg_k
123
+ REQUIRE(update_sketch.get_num_retained() >= k); // before trim() the sketch may hold k or more entries
124
+ update_sketch.trim();
125
+ REQUIRE(update_sketch.get_num_retained() == k); // after trim() exactly k entries must remain
126
+
127
+ compact_theta_sketch compact_sketch = update_sketch.compact();
128
+ REQUIRE_FALSE(compact_sketch.is_empty());
129
+ REQUIRE(compact_sketch.is_ordered());
130
+ REQUIRE(compact_sketch.is_estimation_mode());
131
+ REQUIRE(compact_sketch.get_theta() < 1.0);
132
+ REQUIRE(compact_sketch.get_estimate() == Approx((double) n).margin(n * 0.01));
133
+ REQUIRE(compact_sketch.get_lower_bound(1) < n);
134
+ REQUIRE(compact_sketch.get_upper_bound(1) > n);
135
+ }
136
+
137
+ TEST_CASE("theta sketch: deserialize update empty from java as base", "[theta_sketch]") { // reads theta_update_empty_from_java.sk through theta_sketch::deserialize and expects an empty sketch; the fixture was presumably written by the Java implementation, per its file name
138
+ std::ifstream is;
139
+ is.exceptions(std::ios::failbit | std::ios::badbit); // make open/read failures throw instead of leaving the stream silently in a failed state
140
+ is.open(inputPath + "theta_update_empty_from_java.sk", std::ios::binary);
141
+ auto sketchptr = theta_sketch::deserialize(is); // result is used through -> (pointer-like handle)
142
+ REQUIRE(sketchptr->is_empty());
143
+ REQUIRE_FALSE(sketchptr->is_estimation_mode());
144
+ REQUIRE(sketchptr->get_num_retained() == 0);
145
+ REQUIRE(sketchptr->get_theta() == 1.0);
146
+ REQUIRE(sketchptr->get_estimate() == 0.0);
147
+ REQUIRE(sketchptr->get_lower_bound(1) == 0.0);
148
+ REQUIRE(sketchptr->get_upper_bound(1) == 0.0);
149
+ }
150
+
151
+ TEST_CASE("theta sketch: deserialize update empty from java as subclass", "[theta_sketch]") { // reads theta_update_empty_from_java.sk through update_theta_sketch::deserialize and expects an empty sketch
152
+ std::ifstream is;
153
+ is.exceptions(std::ios::failbit | std::ios::badbit); // make open/read failures throw instead of leaving the stream silently in a failed state
154
+ is.open(inputPath + "theta_update_empty_from_java.sk", std::ios::binary);
155
+ auto sketch = update_theta_sketch::deserialize(is); // result is used as an object (member access with .)
156
+ REQUIRE(sketch.is_empty());
157
+ REQUIRE_FALSE(sketch.is_estimation_mode());
158
+ REQUIRE(sketch.get_num_retained() == 0);
159
+ REQUIRE(sketch.get_theta() == 1.0);
160
+ REQUIRE(sketch.get_estimate() == 0.0);
161
+ REQUIRE(sketch.get_lower_bound(1) == 0.0);
162
+ REQUIRE(sketch.get_upper_bound(1) == 0.0);
163
+ }
164
+
165
+ TEST_CASE("theta sketch: deserialize update estimation from java as base", "[theta_sketch]") { // reads theta_update_estimation_from_java.sk through theta_sketch::deserialize and expects an estimation-mode sketch estimating about 10000
166
+ std::ifstream is;
167
+ is.exceptions(std::ios::failbit | std::ios::badbit); // make open/read failures throw instead of leaving the stream silently in a failed state
168
+ is.open(inputPath + "theta_update_estimation_from_java.sk", std::ios::binary);
169
+ auto sketchptr = theta_sketch::deserialize(is);
170
+ REQUIRE_FALSE(sketchptr->is_empty());
171
+ REQUIRE(sketchptr->is_estimation_mode());
172
+ REQUIRE(sketchptr->get_num_retained() == 5324); // exact entry count expected from this fixture
173
+ REQUIRE(sketchptr->get_estimate() == Approx(10000.0).margin(10000 * 0.01)); // absolute tolerance of 1% of 10000
174
+ REQUIRE(sketchptr->get_lower_bound(1) < 10000);
175
+ REQUIRE(sketchptr->get_upper_bound(1) > 10000);
176
+ }
177
+
178
+ TEST_CASE("theta sketch: deserialize update estimation from java as subclass", "[theta_sketch]") { // reads theta_update_estimation_from_java.sk through update_theta_sketch::deserialize and expects an estimation-mode sketch estimating about 10000
179
+ std::ifstream is;
180
+ is.exceptions(std::ios::failbit | std::ios::badbit); // make open/read failures throw instead of leaving the stream silently in a failed state
181
+ is.open(inputPath + "theta_update_estimation_from_java.sk", std::ios::binary);
182
+ auto sketch = update_theta_sketch::deserialize(is);
183
+ REQUIRE_FALSE(sketch.is_empty());
184
+ REQUIRE(sketch.is_estimation_mode());
185
+ REQUIRE(sketch.get_num_retained() == 5324); // exact entry count expected from this fixture
186
+ REQUIRE(sketch.get_estimate() == Approx(10000.0).margin(10000 * 0.01)); // absolute tolerance of 1% of 10000
187
+ REQUIRE(sketch.get_lower_bound(1) < 10000);
188
+ REQUIRE(sketch.get_upper_bound(1) > 10000);
189
+ }
190
+
191
+ TEST_CASE("theta sketch: deserialize compact empty from java as base", "[theta_sketch]") { // reads theta_compact_empty_from_java.sk through theta_sketch::deserialize and expects an empty sketch
192
+ std::ifstream is;
193
+ is.exceptions(std::ios::failbit | std::ios::badbit); // make open/read failures throw instead of leaving the stream silently in a failed state
194
+ is.open(inputPath + "theta_compact_empty_from_java.sk", std::ios::binary);
195
+ auto sketchptr = theta_sketch::deserialize(is);
196
+ REQUIRE(sketchptr->is_empty());
197
+ REQUIRE_FALSE(sketchptr->is_estimation_mode());
198
+ REQUIRE(sketchptr->get_num_retained() == 0);
199
+ REQUIRE(sketchptr->get_theta() == 1.0);
200
+ REQUIRE(sketchptr->get_estimate() == 0.0);
201
+ REQUIRE(sketchptr->get_lower_bound(1) == 0.0);
202
+ REQUIRE(sketchptr->get_upper_bound(1) == 0.0);
203
+ }
204
+
205
+ TEST_CASE("theta sketch: deserialize compact empty from java as subclass", "[theta_sketch]") { // reads theta_compact_empty_from_java.sk through compact_theta_sketch::deserialize and expects an empty sketch
206
+ std::ifstream is;
207
+ is.exceptions(std::ios::failbit | std::ios::badbit); // make open/read failures throw instead of leaving the stream silently in a failed state
208
+ is.open(inputPath + "theta_compact_empty_from_java.sk", std::ios::binary);
209
+ auto sketch = compact_theta_sketch::deserialize(is);
210
+ REQUIRE(sketch.is_empty());
211
+ REQUIRE_FALSE(sketch.is_estimation_mode());
212
+ REQUIRE(sketch.get_num_retained() == 0);
213
+ REQUIRE(sketch.get_theta() == 1.0);
214
+ REQUIRE(sketch.get_estimate() == 0.0);
215
+ REQUIRE(sketch.get_lower_bound(1) == 0.0);
216
+ REQUIRE(sketch.get_upper_bound(1) == 0.0);
217
+ }
218
+
219
+ TEST_CASE("theta sketch: deserialize single item from java as base", "[theta_sketch]") { // reads theta_compact_single_item_from_java.sk through theta_sketch::deserialize and expects an exact-mode sketch holding one entry
220
+ std::ifstream is;
221
+ is.exceptions(std::ios::failbit | std::ios::badbit); // make open/read failures throw instead of leaving the stream silently in a failed state
222
+ is.open(inputPath + "theta_compact_single_item_from_java.sk", std::ios::binary);
223
+ auto sketchptr = theta_sketch::deserialize(is);
224
+ REQUIRE_FALSE(sketchptr->is_empty());
225
+ REQUIRE_FALSE(sketchptr->is_estimation_mode());
226
+ REQUIRE(sketchptr->get_num_retained() == 1);
227
+ REQUIRE(sketchptr->get_theta() == 1.0);
228
+ REQUIRE(sketchptr->get_estimate() == 1.0);
229
+ REQUIRE(sketchptr->get_lower_bound(1) == 1.0);
230
+ REQUIRE(sketchptr->get_upper_bound(1) == 1.0);
231
+ }
232
+
233
+ TEST_CASE("theta sketch: deserialize single item from java as subclass", "[theta_sketch]") { // reads theta_compact_single_item_from_java.sk through compact_theta_sketch::deserialize and expects an exact-mode sketch holding one entry
234
+ std::ifstream is;
235
+ is.exceptions(std::ios::failbit | std::ios::badbit); // make open/read failures throw instead of leaving the stream silently in a failed state
236
+ is.open(inputPath + "theta_compact_single_item_from_java.sk", std::ios::binary);
237
+ auto sketch = compact_theta_sketch::deserialize(is);
238
+ REQUIRE_FALSE(sketch.is_empty());
239
+ REQUIRE_FALSE(sketch.is_estimation_mode());
240
+ REQUIRE(sketch.get_num_retained() == 1);
241
+ REQUIRE(sketch.get_theta() == 1.0);
242
+ REQUIRE(sketch.get_estimate() == 1.0);
243
+ REQUIRE(sketch.get_lower_bound(1) == 1.0);
244
+ REQUIRE(sketch.get_upper_bound(1) == 1.0);
245
+ }
246
+
247
+ TEST_CASE("theta sketch: deserialize compact estimation from java as base", "[theta_sketch]") { // reads theta_compact_estimation_from_java.sk through theta_sketch::deserialize, checks pinned numeric values, then compares it against a sketch rebuilt locally from 0..8191
248
+ std::ifstream is;
249
+ is.exceptions(std::ios::failbit | std::ios::badbit); // make open/read failures throw instead of leaving the stream silently in a failed state
250
+ is.open(inputPath + "theta_compact_estimation_from_java.sk", std::ios::binary);
251
+ auto sketchptr = theta_sketch::deserialize(is);
252
+ REQUIRE_FALSE(sketchptr->is_empty());
253
+ REQUIRE(sketchptr->is_estimation_mode());
254
+ REQUIRE(sketchptr->is_ordered());
255
+ REQUIRE(sketchptr->get_num_retained() == 4342);
256
+ REQUIRE(sketchptr->get_theta() == Approx(0.531700444213199).margin(1e-10)); // pinned reference values for this fixture, compared with an absolute tolerance of 1e-10
257
+ REQUIRE(sketchptr->get_estimate() == Approx(8166.25234614053).margin(1e-10));
258
+ REQUIRE(sketchptr->get_lower_bound(2) == Approx(7996.956955317471).margin(1e-10));
259
+ REQUIRE(sketchptr->get_upper_bound(2) == Approx(8339.090301078124).margin(1e-10));
260
+
261
+ // the same construction process in Java must have produced exactly the same sketch
262
+ update_theta_sketch update_sketch = update_theta_sketch::builder().build();
263
+ const int n = 8192;
264
+ for (int i = 0; i < n; i++) update_sketch.update(i);
265
+ REQUIRE(sketchptr->get_num_retained() == update_sketch.get_num_retained()); // equal counts also keep the lock-step iteration below within range
266
+ REQUIRE(sketchptr->get_theta() == Approx(update_sketch.get_theta()).margin(1e-10));
267
+ REQUIRE(sketchptr->get_estimate() == Approx(update_sketch.get_estimate()).margin(1e-10));
268
+ REQUIRE(sketchptr->get_lower_bound(1) == Approx(update_sketch.get_lower_bound(1)).margin(1e-10));
269
+ REQUIRE(sketchptr->get_upper_bound(1) == Approx(update_sketch.get_upper_bound(1)).margin(1e-10));
270
+ REQUIRE(sketchptr->get_lower_bound(2) == Approx(update_sketch.get_lower_bound(2)).margin(1e-10));
271
+ REQUIRE(sketchptr->get_upper_bound(2) == Approx(update_sketch.get_upper_bound(2)).margin(1e-10));
272
+ REQUIRE(sketchptr->get_lower_bound(3) == Approx(update_sketch.get_lower_bound(3)).margin(1e-10));
273
+ REQUIRE(sketchptr->get_upper_bound(3) == Approx(update_sketch.get_upper_bound(3)).margin(1e-10));
274
+ compact_theta_sketch compact_sketch = update_sketch.compact();
275
+ // the sketches are ordered, so the iteration sequence must match exactly
276
+ auto iter = sketchptr->begin();
277
+ for (auto key: compact_sketch) {
278
+ REQUIRE(*iter == key); // entry-by-entry comparison of the deserialized sketch against the locally built compact sketch
279
+ ++iter;
280
+ }
281
+ }
282
+
283
+ TEST_CASE("theta sketch: deserialize compact estimation from java as subclass", "[theta_sketch]") { // reads theta_compact_estimation_from_java.sk through compact_theta_sketch::deserialize, checks pinned numeric values, then compares it against a sketch rebuilt locally from 0..8191
284
+ std::ifstream is;
285
+ is.exceptions(std::ios::failbit | std::ios::badbit); // make open/read failures throw instead of leaving the stream silently in a failed state
286
+ is.open(inputPath + "theta_compact_estimation_from_java.sk", std::ios::binary);
287
+ auto sketch = compact_theta_sketch::deserialize(is);
288
+ REQUIRE_FALSE(sketch.is_empty());
289
+ REQUIRE(sketch.is_estimation_mode());
290
+ REQUIRE(sketch.get_num_retained() == 4342);
291
+ REQUIRE(sketch.get_theta() == Approx(0.531700444213199).margin(1e-10)); // pinned reference values for this fixture, compared with an absolute tolerance of 1e-10
292
+ REQUIRE(sketch.get_estimate() == Approx(8166.25234614053).margin(1e-10));
293
+ REQUIRE(sketch.get_lower_bound(2) == Approx(7996.956955317471).margin(1e-10));
294
+ REQUIRE(sketch.get_upper_bound(2) == Approx(8339.090301078124).margin(1e-10));
295
+
296
+ update_theta_sketch update_sketch = update_theta_sketch::builder().build(); // locally rebuilt counterpart fed the integers 0..8191
297
+ const int n = 8192;
298
+ for (int i = 0; i < n; i++) update_sketch.update(i);
299
+ REQUIRE(sketch.get_num_retained() == update_sketch.get_num_retained());
300
+ REQUIRE(sketch.get_theta() == Approx(update_sketch.get_theta()).margin(1e-10));
301
+ REQUIRE(sketch.get_estimate() == Approx(update_sketch.get_estimate()).margin(1e-10));
302
+ REQUIRE(sketch.get_lower_bound(1) == Approx(update_sketch.get_lower_bound(1)).margin(1e-10));
303
+ REQUIRE(sketch.get_upper_bound(1) == Approx(update_sketch.get_upper_bound(1)).margin(1e-10));
304
+ REQUIRE(sketch.get_lower_bound(2) == Approx(update_sketch.get_lower_bound(2)).margin(1e-10));
305
+ REQUIRE(sketch.get_upper_bound(2) == Approx(update_sketch.get_upper_bound(2)).margin(1e-10));
306
+ REQUIRE(sketch.get_lower_bound(3) == Approx(update_sketch.get_lower_bound(3)).margin(1e-10));
307
+ REQUIRE(sketch.get_upper_bound(3) == Approx(update_sketch.get_upper_bound(3)).margin(1e-10));
308
+ }
309
+
310
+ TEST_CASE("theta sketch: serialize deserialize stream and bytes equivalency", "[theta_sketch]") { // serializes the same sketch to a stream and to a byte vector, checks both forms are byte-identical, and checks that deserializing either form yields matching sketches; done for the update sketch and for its compact form
311
+ update_theta_sketch update_sketch = update_theta_sketch::builder().build();
312
+ const int n = 8192;
313
+ for (int i = 0; i < n; i++) update_sketch.update(i);
314
+
315
+ // update sketch stream and bytes comparison
316
+ {
317
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
318
+ update_sketch.serialize(s);
319
+ auto bytes = update_sketch.serialize();
320
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellp())); // both serialized forms must have the same length
321
+ for (size_t i = 0; i < bytes.size(); ++i) {
322
+ REQUIRE(((char*)bytes.data())[i] == (char)s.get()); // byte-for-byte comparison; s.get() advances the stream's read position
323
+ }
324
+
325
+ // deserialize as base class
326
+ {
327
+ s.seekg(0); // rewind
328
+ auto deserialized_sketch_ptr1 = theta_sketch::deserialize(s);
329
+ auto deserialized_sketch_ptr2 = theta_sketch::deserialize(bytes.data(), bytes.size());
330
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellg())); // stream deserialization must have consumed exactly the serialized length
331
+ REQUIRE(deserialized_sketch_ptr2->is_empty() == deserialized_sketch_ptr1->is_empty());
332
+ REQUIRE(deserialized_sketch_ptr2->is_ordered() == deserialized_sketch_ptr1->is_ordered());
333
+ REQUIRE(deserialized_sketch_ptr2->get_num_retained() == deserialized_sketch_ptr1->get_num_retained());
334
+ REQUIRE(deserialized_sketch_ptr2->get_theta() == deserialized_sketch_ptr1->get_theta());
335
+ REQUIRE(deserialized_sketch_ptr2->get_estimate() == deserialized_sketch_ptr1->get_estimate());
336
+ REQUIRE(deserialized_sketch_ptr2->get_lower_bound(1) == deserialized_sketch_ptr1->get_lower_bound(1));
337
+ REQUIRE(deserialized_sketch_ptr2->get_upper_bound(1) == deserialized_sketch_ptr1->get_upper_bound(1));
338
+ // hash tables must be identical since they are restored from dumps, and iteration is deterministic
339
+ auto iter = deserialized_sketch_ptr1->begin();
340
+ for (auto key: *deserialized_sketch_ptr2) {
341
+ REQUIRE(*iter == key);
342
+ ++iter;
343
+ }
344
+ }
345
+
346
+ // deserialize as subclass
347
+ {
348
+ s.seekg(0); // rewind
349
+ update_theta_sketch deserialized_sketch1 = update_theta_sketch::deserialize(s);
350
+ update_theta_sketch deserialized_sketch2 = update_theta_sketch::deserialize(bytes.data(), bytes.size());
351
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellg())); // stream deserialization must have consumed exactly the serialized length
352
+ REQUIRE(deserialized_sketch2.is_empty() == deserialized_sketch1.is_empty());
353
+ REQUIRE(deserialized_sketch2.is_ordered() == deserialized_sketch1.is_ordered());
354
+ REQUIRE(deserialized_sketch2.get_num_retained() == deserialized_sketch1.get_num_retained());
355
+ REQUIRE(deserialized_sketch2.get_theta() == deserialized_sketch1.get_theta());
356
+ REQUIRE(deserialized_sketch2.get_estimate() == deserialized_sketch1.get_estimate());
357
+ REQUIRE(deserialized_sketch2.get_lower_bound(1) == deserialized_sketch1.get_lower_bound(1));
358
+ REQUIRE(deserialized_sketch2.get_upper_bound(1) == deserialized_sketch1.get_upper_bound(1));
359
+ // hash tables must be identical since they are restored from dumps, and iteration is deterministic
360
+ auto iter = deserialized_sketch1.begin();
361
+ for (auto key: deserialized_sketch2) {
362
+ REQUIRE(*iter == key);
363
+ ++iter;
364
+ }
365
+ }
366
+ }
367
+
368
+ // compact sketch stream and bytes comparison
369
+ {
370
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
371
+ update_sketch.compact().serialize(s);
372
+ auto bytes = update_sketch.compact().serialize();
373
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellp())); // both serialized forms must have the same length
374
+ for (size_t i = 0; i < bytes.size(); ++i) {
375
+ REQUIRE(((char*)bytes.data())[i] == (char)s.get()); // byte-for-byte comparison; s.get() advances the stream's read position
376
+ }
377
+
378
+ // deserialize as base class
379
+ {
380
+ s.seekg(0); // rewind
381
+ auto deserialized_sketch_ptr1 = theta_sketch::deserialize(s);
382
+ auto deserialized_sketch_ptr2 = theta_sketch::deserialize(bytes.data(), bytes.size());
383
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellg())); // stream deserialization must have consumed exactly the serialized length
384
+ REQUIRE(deserialized_sketch_ptr2->is_empty() == deserialized_sketch_ptr1->is_empty());
385
+ REQUIRE(deserialized_sketch_ptr2->is_ordered() == deserialized_sketch_ptr1->is_ordered());
386
+ REQUIRE(deserialized_sketch_ptr2->get_num_retained() == deserialized_sketch_ptr1->get_num_retained());
387
+ REQUIRE(deserialized_sketch_ptr2->get_theta() == deserialized_sketch_ptr1->get_theta());
388
+ REQUIRE(deserialized_sketch_ptr2->get_estimate() == deserialized_sketch_ptr1->get_estimate());
389
+ REQUIRE(deserialized_sketch_ptr2->get_lower_bound(1) == deserialized_sketch_ptr1->get_lower_bound(1));
390
+ REQUIRE(deserialized_sketch_ptr2->get_upper_bound(1) == deserialized_sketch_ptr1->get_upper_bound(1));
391
+ // the sketches are ordered, so the iteration sequence must match exactly
392
+ auto iter = deserialized_sketch_ptr1->begin();
393
+ for (auto key: *deserialized_sketch_ptr2) {
394
+ REQUIRE(*iter == key);
395
+ ++iter;
396
+ }
397
+ }
398
+
399
+ // deserialize as subclass
400
+ {
401
+ s.seekg(0); // rewind
402
+ compact_theta_sketch deserialized_sketch1 = compact_theta_sketch::deserialize(s);
403
+ compact_theta_sketch deserialized_sketch2 = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
404
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellg())); // stream deserialization must have consumed exactly the serialized length
405
+ REQUIRE(deserialized_sketch2.is_empty() == deserialized_sketch1.is_empty());
406
+ REQUIRE(deserialized_sketch2.is_ordered() == deserialized_sketch1.is_ordered());
407
+ REQUIRE(deserialized_sketch2.get_num_retained() == deserialized_sketch1.get_num_retained());
408
+ REQUIRE(deserialized_sketch2.get_theta() == deserialized_sketch1.get_theta());
409
+ REQUIRE(deserialized_sketch2.get_estimate() == deserialized_sketch1.get_estimate());
410
+ REQUIRE(deserialized_sketch2.get_lower_bound(1) == deserialized_sketch1.get_lower_bound(1));
411
+ REQUIRE(deserialized_sketch2.get_upper_bound(1) == deserialized_sketch1.get_upper_bound(1));
412
+ // the sketches are ordered, so the iteration sequence must match exactly
413
+ auto iter = deserialized_sketch1.begin();
414
+ for (auto key: deserialized_sketch2) {
415
+ REQUIRE(*iter == key);
416
+ ++iter;
417
+ }
418
+ }
419
+ }
420
+ }
421
+
422
+ TEST_CASE("theta sketch: deserialize update single item buffer overrun", "[theta_sketch]") { // deserializing a serialized one-item update sketch from a buffer reported shorter than it is must throw std::out_of_range
423
+ update_theta_sketch update_sketch = update_theta_sketch::builder().build();
424
+ update_sketch.update(1);
425
+ theta_sketch::vector_bytes bytes = update_sketch.serialize();
426
+ REQUIRE_THROWS_AS(update_theta_sketch::deserialize(bytes.data(), 7), std::out_of_range); // only 7 bytes declared available
427
+ REQUIRE_THROWS_AS(update_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range); // one byte short of the full image
428
+ }
429
+
430
+ TEST_CASE("theta sketch: deserialize compact single item buffer overrun", "[theta_sketch]") { // deserializing a serialized one-item compact sketch from a buffer reported shorter than it is must throw std::out_of_range
431
+ update_theta_sketch update_sketch = update_theta_sketch::builder().build();
432
+ update_sketch.update(1);
433
+ theta_sketch::vector_bytes bytes = update_sketch.compact().serialize();
434
+ REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 7), std::out_of_range); // only 7 bytes declared available
435
+ REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range); // one byte short of the full image
436
+ }
437
+
438
+ } /* namespace datasketches */