datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,358 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <var_opt_union.hpp>
21
+ #include "test_type.hpp"
22
+
23
+ #include <catch.hpp>
24
+
25
+ #include <vector>
26
+ #include <string>
27
+ #include <sstream>
28
+ #include <fstream>
29
+ #include <cmath>
30
+ #include <random>
31
+
32
+ #ifdef TEST_BINARY_INPUT_PATH
33
+ static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
34
+ #else
35
+ static std::string testBinaryInputPath = "test/";
36
+ #endif
37
+
38
+ namespace datasketches {
39
+
40
+ static constexpr double EPS = 1e-13;
41
+
42
+ static var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
43
+ var_opt_sketch<int> sk(k);
44
+ for (uint64_t i = 0; i < n; ++i) {
45
+ sk.update(i, 1.0);
46
+ }
47
+ return sk;
48
+ }
49
+
50
+ // if exact_compare = false, checks for equivalence -- specific R region values may differ but
51
+ // R region weights must match
52
+ template<typename T, typename S, typename A>
53
+ static void check_if_equal(var_opt_sketch<T,S,A>& sk1, var_opt_sketch<T,S,A>& sk2, bool exact_compare = true) {
54
+ REQUIRE(sk1.get_k() == sk2.get_k());
55
+ REQUIRE(sk1.get_n() == sk2.get_n());
56
+ REQUIRE(sk1.get_num_samples() == sk2.get_num_samples());
57
+
58
+ auto it1 = sk1.begin();
59
+ auto it2 = sk2.begin();
60
+ size_t i = 0;
61
+
62
+ while ((it1 != sk1.end()) && (it2 != sk2.end())) {
63
+ const std::pair<const T&, const double> p1 = *it1;
64
+ const std::pair<const T&, const double> p2 = *it2;
65
+ if (exact_compare) {
66
+ REQUIRE(p1.first == p2.first); // data values
67
+ }
68
+ REQUIRE(p1.second == p2.second); // weight values
69
+ ++i;
70
+ ++it1;
71
+ ++it2;
72
+ }
73
+
74
+ REQUIRE((it1 == sk1.end() && it2 == sk2.end())); // iterators must end at the same time
75
+ }
76
+
77
+ // compare serialization and deserialization results, checking string and stream methods to
78
+ // ensure that the resulting binary images are compatible.
79
+ // if exact_compare = false, checks for equivalence -- specific R region values may differ but
80
+ // R region weights must match
81
+ template<typename T, typename S, typename A>
82
+ static void compare_serialization_deserialization(var_opt_union<T,S,A>& vo_union, bool exact_compare = true) {
83
+ std::vector<uint8_t> bytes = vo_union.serialize();
84
+
85
+ var_opt_union<T> u_from_bytes = var_opt_union<T>::deserialize(bytes.data(), bytes.size());
86
+ var_opt_sketch<T> sk1 = vo_union.get_result();
87
+ var_opt_sketch<T> sk2 = u_from_bytes.get_result();
88
+ check_if_equal(sk1, sk2, exact_compare);
89
+
90
+ std::string str(bytes.begin(), bytes.end());
91
+ std::stringstream ss;
92
+ ss.str(str);
93
+
94
+ var_opt_union<T> u_from_stream = var_opt_union<T>::deserialize(ss);
95
+ sk2 = u_from_stream.get_result();
96
+ check_if_equal(sk1, sk2, exact_compare);
97
+
98
+ ss.seekg(0); // didn't put anything so only reset read position
99
+ vo_union.serialize(ss);
100
+ u_from_stream = var_opt_union<T>::deserialize(ss);
101
+ sk2 = u_from_stream.get_result();
102
+ check_if_equal(sk1, sk2, exact_compare);
103
+
104
+ std::string str_from_stream = ss.str();
105
+ var_opt_union<T> u_from_str = var_opt_union<T>::deserialize(str_from_stream.c_str(), str_from_stream.size());
106
+ sk2 = u_from_str.get_result();
107
+ check_if_equal(sk1, sk2, exact_compare);
108
+
109
+ // check truncated input, too
110
+ REQUIRE_THROWS_AS(var_opt_union<T>::deserialize(bytes.data(), bytes.size() - 5), std::out_of_range);
111
+ std::string str_trunc((char*)&bytes[0], bytes.size() - 5);
112
+ ss.str(str_trunc);
113
+ // next line may throw either std::illegal_argument or std::runtime_exception
114
+ REQUIRE_THROWS_AS(var_opt_union<T>::deserialize(ss), std::exception);
115
+ }
116
+
117
+ TEST_CASE("varopt union: bad prelongs", "[var_opt_union]") {
118
+ var_opt_sketch<int> sk = create_unweighted_sketch(32, 33);
119
+ var_opt_union<int> u(32);
120
+ u.update(sk);
121
+ std::vector<uint8_t> bytes = u.serialize();
122
+
123
+ bytes[0] = 0; // corrupt the preamble longs byte to be too small
124
+ REQUIRE_THROWS_AS(var_opt_union<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
125
+
126
+ // create a stringstream to check the same
127
+ std::stringstream ss;
128
+ std::string str(bytes.begin(), bytes.end());
129
+ ss.str(str);
130
+ REQUIRE_THROWS_AS(var_opt_union<int>::deserialize(ss), std::invalid_argument);
131
+ }
132
+
133
+ TEST_CASE("varopt union: bad serialization version", "[var_opt_union]") {
134
+ var_opt_sketch<int> sk = create_unweighted_sketch(16, 16);
135
+ var_opt_union<int> u(32);
136
+ u.update(sk);
137
+ std::vector<uint8_t> bytes = u.serialize();
138
+ bytes[1] = 0; // corrupt the serialization version byte
139
+
140
+ REQUIRE_THROWS_AS(var_opt_union<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
141
+
142
+ // create a stringstream to check the same
143
+ std::stringstream ss;
144
+ std::string str(bytes.begin(), bytes.end());
145
+ ss.str(str);
146
+ REQUIRE_THROWS_AS(var_opt_union<int>::deserialize(ss), std::invalid_argument);
147
+ }
148
+
149
+ TEST_CASE("varopt union: invalid k", "[var_opt_union]") {
150
+ REQUIRE_THROWS_AS(var_opt_union<int>(0), std::invalid_argument);
151
+ REQUIRE_THROWS_AS(var_opt_union<int>(1<<31), std::invalid_argument);
152
+ }
153
+
154
+ TEST_CASE("varopt union: bad family", "[var_opt_union]") {
155
+ var_opt_sketch<int> sk = create_unweighted_sketch(16, 16);
156
+ var_opt_union<int> u(15);
157
+ u.update(sk);
158
+ std::vector<uint8_t> bytes = u.serialize();
159
+ bytes[2] = 0; // corrupt the family byte
160
+
161
+ REQUIRE_THROWS_AS(var_opt_union<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
162
+
163
+ std::stringstream ss;
164
+ std::string str(bytes.begin(), bytes.end());
165
+ ss.str(str);
166
+ REQUIRE_THROWS_AS(var_opt_union<int>::deserialize(ss), std::invalid_argument);
167
+ }
168
+
169
+ TEST_CASE("varopt union: empty union", "[var_opt_union]") {
170
+ uint32_t k = 2048;
171
+ var_opt_sketch<std::string> sk(k);
172
+ var_opt_union<std::string> u(k);
173
+ u.update(sk);
174
+
175
+ var_opt_sketch<std::string> result = u.get_result();
176
+ REQUIRE(result.is_empty());
177
+ REQUIRE(result.get_n() == 0);
178
+ REQUIRE(result.get_num_samples() == 0);
179
+ REQUIRE(result.get_k() == k);
180
+ }
181
+
182
+ TEST_CASE("varopt union: two exact sketches", "[var_opt_union]") {
183
+ uint64_t n = 4; // 2n < k
184
+ uint32_t k = 10;
185
+ var_opt_sketch<int> sk1(k), sk2(k);
186
+
187
+ for (uint64_t i = 1; i <= n; ++i) {
188
+ sk1.update(i, i);
189
+ sk2.update(static_cast<int64_t>(-i), i);
190
+ }
191
+
192
+ var_opt_union<int> u(k);
193
+ u.update(sk1);
194
+ u.update(sk2);
195
+
196
+ var_opt_sketch<int> result = u.get_result();
197
+ REQUIRE(result.get_n() == 2 * n);
198
+ REQUIRE(result.get_k() == k);
199
+ }
200
+
201
+ TEST_CASE("varopt union: heavy sampling sketch", "[var_opt_union]") {
202
+ uint64_t n1 = 20;
203
+ uint32_t k1 = 10;
204
+ uint64_t n2 = 6;
205
+ uint32_t k2 = 5;
206
+ var_opt_sketch<int64_t> sk1(k1), sk2(k2);
207
+ for (uint64_t i = 1; i <= n1; ++i) {
208
+ sk1.update(i, i);
209
+ }
210
+
211
+ for (uint64_t i = 1; i < n2; ++i) { // we'll add a very heavy one later
212
+ sk2.update(static_cast<int64_t>(-i), i + 1000.0);
213
+ }
214
+ sk2.update(-n2, 1000000.0);
215
+
216
+ var_opt_union<int64_t> u(k1);
217
+ u.update(sk1);
218
+ u.update(sk2);
219
+
220
+ var_opt_sketch<int64_t> result = u.get_result();
221
+ REQUIRE(result.get_n() == n1 + n2);
222
+ REQUIRE(result.get_k() == k2); // heavy enough the result pulls back to k2
223
+
224
+ u.reset();
225
+ result = u.get_result();
226
+ REQUIRE(result.get_n() == 0);
227
+ REQUIRE(result.get_k() == k1); // union reset so empty result reflects max_k
228
+ }
229
+
230
+ TEST_CASE("varopt union: identical sampling sketches", "[var_opt_union]") {
231
+ uint32_t k = 20;
232
+ uint64_t n = 50;
233
+ var_opt_sketch<int> sk = create_unweighted_sketch(k, n);
234
+
235
+ var_opt_union<int> u(k);
236
+ u.update(sk);
237
+ u.update(sk);
238
+
239
+ var_opt_sketch<int> result = u.get_result();
240
+ double expected_wt = 2.0 * n;
241
+ subset_summary ss = result.estimate_subset_sum([](int){return true;});
242
+ REQUIRE(result.get_n() == 2 * n);
243
+ REQUIRE(ss.total_sketch_weight == Approx(expected_wt).margin(EPS));
244
+
245
+ // add another sketch, such that sketch_tau < outer_tau
246
+ sk = create_unweighted_sketch(k, k + 1); // tau = (k + 1) / k
247
+ u.update(sk);
248
+ result = u.get_result();
249
+ expected_wt = (2.0 * n) + k + 1;
250
+ ss = result.estimate_subset_sum([](int){return true;});
251
+ REQUIRE(result.get_n() == (2 * n) + k + 1);
252
+ REQUIRE(ss.total_sketch_weight == Approx(expected_wt).margin(EPS));
253
+ }
254
+
255
+ TEST_CASE("varopt union: small sampling sketch", "[var_opt_union]") {
256
+ uint32_t k_small = 16;
257
+ uint32_t k_max = 128;
258
+ uint64_t n1 = 32;
259
+ uint64_t n2 = 64;
260
+
261
+ var_opt_sketch<float> sk(k_small);
262
+ for (uint64_t i = 0; i < n1; ++i) { sk.update(i); }
263
+ sk.update(-1, n1 * n1); // add a heavy item
264
+
265
+ var_opt_union<float> u(k_max);
266
+ u.update(sk);
267
+
268
+ // another one, but different n to get a different per-item weight
269
+ var_opt_sketch<float> sk2(k_small);
270
+ for (uint64_t i = 0; i < n2; ++i) { sk2.update(i); }
271
+ u.update(sk2);
272
+
273
+ // should trigger migrate_marked_items_by_decreasing_k()
274
+ var_opt_sketch<float> result = u.get_result();
275
+ REQUIRE(result.get_n() == n1 + n2 + 1);
276
+
277
+ double expected_wt = 1.0 * (n1 + n2); // n1 + n2 light items, ignore the heavy one
278
+ subset_summary ss = result.estimate_subset_sum([](float x){return x >= 0;});
279
+ REQUIRE(ss.estimate == Approx(expected_wt).margin(EPS));
280
+ REQUIRE(ss.total_sketch_weight == Approx(expected_wt + (n1 * n1)).margin(EPS));
281
+ REQUIRE(result.get_k() < k_max);
282
+
283
+ // check that mark information is preserved as expected
284
+ compare_serialization_deserialization(u, false);
285
+ }
286
+
287
+ TEST_CASE("varopt union: serialize empty", "[var_opt_union]") {
288
+ var_opt_union<std::string> u(100);
289
+ compare_serialization_deserialization(u);
290
+ }
291
+
292
+ TEST_CASE("varopt union: serialize exact", "[var_opt_union]") {
293
+ uint32_t k = 100;
294
+ var_opt_union<int> u(k);
295
+ var_opt_sketch<int> sk = create_unweighted_sketch(k, k / 2);
296
+ u.update(sk);
297
+
298
+ compare_serialization_deserialization(u);
299
+ }
300
+
301
+ TEST_CASE("varopt union: serialize sampling", "[var_opt_union]") {
302
+ uint32_t k = 100;
303
+ var_opt_union<int> u(k);
304
+ var_opt_sketch<int> sk = create_unweighted_sketch(k, 2 * k);
305
+ u.update(sk);
306
+
307
+ compare_serialization_deserialization(u);
308
+ }
309
+
310
+ TEST_CASE("varopt union: deserialize from java", "[var_opt_union]") {
311
+ std::ifstream is;
312
+ is.exceptions(std::ios::failbit | std::ios::badbit);
313
+ is.open(testBinaryInputPath + "varopt_union_double_sampling.sk", std::ios::binary);
314
+ var_opt_union<double> u = var_opt_union<double>::deserialize(is);
315
+
316
+ // must reduce k in the process, like in small_sampling_sketch()
317
+ var_opt_sketch<double> result = u.get_result();
318
+ REQUIRE_FALSE(result.is_empty());
319
+ REQUIRE(result.get_n() == 97);
320
+
321
+ double expected_wt = 96.0;// light items -- ignoring the heavy one
322
+ subset_summary ss = result.estimate_subset_sum([](double x){return x >= 0;});
323
+ REQUIRE(ss.estimate == Approx(expected_wt).margin(EPS));
324
+ REQUIRE(ss.total_sketch_weight == Approx(expected_wt + 1024.0).margin(EPS));
325
+ REQUIRE(result.get_k() < 128);
326
+ }
327
+
328
+ TEST_CASE( "varopt union: move", "[var_opt_union][test_type]") {
329
+ uint32_t n = 20;
330
+ uint32_t k = 5;
331
+ var_opt_union<test_type> u(k);
332
+ var_opt_sketch<test_type> sk1(k);
333
+ var_opt_sketch<test_type> sk2(k);
334
+
335
+ // move udpates
336
+ for (int i = 0; i < (int) n; ++i) {
337
+ sk1.update(i);
338
+ sk2.update(-i);
339
+ }
340
+ REQUIRE(sk1.get_n() == n);
341
+ REQUIRE(sk2.get_n() == n);
342
+
343
+ // move unions
344
+ u.update(std::move(sk2));
345
+ u.update(std::move(sk1));
346
+ REQUIRE(u.get_result().get_n() == 2 * n);
347
+
348
+ // move constructor
349
+ var_opt_union<test_type> u2(std::move(u));
350
+ REQUIRE(u2.get_result().get_n() == 2 * n);
351
+
352
+ // move assignment
353
+ var_opt_union<test_type> u3(k);
354
+ u3 = std::move(u2);
355
+ REQUIRE(u3.get_result().get_n() == 2 * n);
356
+ }
357
+
358
+ }
@@ -0,0 +1,94 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ # Modified from:
19
+ # http://www.benjack.io/2018/02/02/python-cpp-revisited.html
20
+
21
+ import os
22
+ import sys
23
+ import sysconfig
24
+ import platform
25
+ import subprocess
26
+
27
+ from setuptools import setup, find_packages, Extension
28
+ from setuptools.command.build_ext import build_ext
29
+
30
class CMakeExtension(Extension):
    """Extension with no source files of its own.

    Only the extension name and the absolute path of ``sourcedir`` are
    recorded on the instance.
    """

    def __init__(self, name, sourcedir=''):
        # an empty source list: nothing is handed to the default compiler
        super().__init__(name, sources=[])
        self.sourcedir = os.path.abspath(sourcedir)
34
+
35
class CMakeBuild(build_ext):
    """``build_ext`` command that configures and builds extensions with CMake."""

    def run(self):
        """Check that ``cmake`` can be launched, then build every extension.

        Raises:
            RuntimeError: if running ``cmake --version`` fails with OSError.
        """
        try:
            subprocess.check_output(['cmake', '--version'])
        except OSError as err:
            # chain the original error so the underlying cause stays visible
            raise RuntimeError(
                "CMake >= 3.12 must be installed to build the following extensions: " +
                ", ".join(e.name for e in self.extensions)) from err

        for ext in self.extensions:
            self.build_extension(ext)

    def build_extension(self, ext):
        """Configure ``ext.sourcedir`` with CMake and build the ``python`` target.

        Both cmake invocations run inside ``self.build_temp``; a non-zero
        exit status raises ``subprocess.CalledProcessError``.
        """
        extdir = os.path.abspath(
            os.path.dirname(self.get_ext_fullpath(ext.name)))
        cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir]
        cmake_args += ['-DWITH_PYTHON=True']
        cfg = 'Debug' if self.debug else 'Release'
        build_args = ['--config', cfg]

        if platform.system() == "Windows":
            cmake_args += ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(
                cfg.upper(),
                extdir)]
            if sys.maxsize > 2**32:
                # 64-bit interpreter: request the x64 generator platform
                cmake_args += ['-A', 'x64']
            build_args += ['--', '/m']
        else:
            cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
            build_args += ['--', '-j2']

        # pass the package version to the compiler as the VERSION_INFO macro
        env = os.environ.copy()
        env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(
            env.get('CXXFLAGS', ''),
            self.distribution.get_version())
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(self.build_temp, exist_ok=True)
        subprocess.check_call(['cmake', ext.sourcedir] + cmake_args,
                              cwd=self.build_temp, env=env)
        subprocess.check_call(['cmake', '--build', '.', '--target', 'python'] + build_args,
                              cwd=self.build_temp)
        print()  # add an empty line to pretty print
77
+
78
# Read the README through a context manager so the file handle is closed
# promptly, and decode it as UTF-8 instead of the locale's default encoding.
with open('python/README.md', encoding='utf-8') as readme_file:
    long_description = readme_file.read()

setup(
    name='datasketches',
    version='2.2.0-SNAPSHOT',
    author='Datasketches Developers',
    author_email='dev@datasketches.apache.org',
    description='A wrapper for the C++ Datasketches library',
    license='Apache License 2.0',
    url='http://datasketches.apache.org',
    long_description=long_description,
    packages=find_packages('python'),  # python packages only in this dir
    package_dir={'': 'python'},
    # may need to add all source paths for sdist packages w/o MANIFEST.in
    ext_modules=[CMakeExtension('datasketches')],
    cmdclass={'build_ext': CMakeBuild},
    setup_requires=['setuptools_scm', 'tox-setuptools'],
    zip_safe=False
)