datasketches 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,229 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch.hpp>
21
+
22
+ #include <kll_sketch.hpp>
23
+ #include <kll_helper.hpp>
24
+
25
+ #include <assert.h>
26
+
27
+ #ifdef KLL_VALIDATION
28
+
29
+ // This is to make sure the implementation matches exactly the reference implementation in OCaml.
30
+ // Conditional compilation is used because the implementation needs a few modifications:
31
+ // - switch from random choice to deterministic
32
+ // - a few methods to expose internals of the sketch
33
+
34
+ namespace datasketches {
35
+
36
+ uint32_t kll_next_offset; // to make kll_sketch deterministic
37
+
38
+ constexpr unsigned num_tests = 114;
39
+
40
+ const int64_t correct_results[num_tests * 7] = {
41
+ 0, 200, 180, 3246533, 1, 180, 1098352976109474698,
42
+ 1, 200, 198, 8349603, 1, 198, 686681527497651888,
43
+ 2, 200, 217, 676491, 2, 117, 495856134049157644,
44
+ 3, 200, 238, 3204507, 2, 138, 44453438498725402,
45
+ 4, 200, 261, 2459373, 2, 161, 719830627391926938,
46
+ 5, 200, 287, 5902143, 2, 187, 389303173170515580,
47
+ 6, 200, 315, 5188793, 2, 215, 985218890825795000,
48
+ 7, 200, 346, 801923, 2, 246, 589362992166904413,
49
+ 8, 200, 380, 2466269, 2, 280, 1081848693781775853,
50
+ 9, 200, 418, 5968041, 2, 318, 533825689515788397,
51
+ 10, 200, 459, 3230027, 2, 243, 937332670315558786,
52
+ 11, 200, 504, 5125875, 2, 288, 1019197831515566845,
53
+ 12, 200, 554, 4195571, 3, 230, 797351479150148224,
54
+ 13, 200, 609, 2221181, 3, 285, 451246040374318529,
55
+ 14, 200, 669, 5865503, 3, 345, 253851269470815909,
56
+ 15, 200, 735, 831703, 3, 411, 491974970526372303,
57
+ 16, 200, 808, 4830785, 3, 327, 1032107507126916277,
58
+ 17, 200, 888, 1356257, 3, 407, 215225420986342944,
59
+ 18, 200, 976, 952071, 3, 417, 600280049738270697,
60
+ 19, 200, 1073, 6729833, 3, 397, 341758522977365969,
61
+ 20, 200, 1180, 6017925, 3, 406, 1080227312339182949,
62
+ 21, 200, 1298, 4229891, 3, 401, 1092460534756675086,
63
+ 22, 200, 1427, 7264889, 4, 320, 884533400696890024,
64
+ 23, 200, 1569, 5836327, 4, 462, 660575800011134382,
65
+ 24, 200, 1725, 5950087, 4, 416, 669373957401387528,
66
+ 25, 200, 1897, 2692555, 4, 406, 607308667566496888,
67
+ 26, 200, 2086, 1512443, 4, 459, 744260340112029032,
68
+ 27, 200, 2294, 2681171, 4, 434, 199120609113802485,
69
+ 28, 200, 2523, 3726521, 4, 450, 570993497599288304,
70
+ 29, 200, 2775, 2695247, 4, 442, 306717093329516310,
71
+ 30, 200, 3052, 5751175, 5, 400, 256024589545754217,
72
+ 31, 200, 3357, 1148897, 5, 514, 507276662329207479,
73
+ 32, 200, 3692, 484127, 5, 457, 1082660223488175122,
74
+ 33, 200, 4061, 6414559, 5, 451, 620820308918522117,
75
+ 34, 200, 4467, 5587461, 5, 466, 121975084804459305,
76
+ 35, 200, 4913, 1615017, 5, 483, 152986529342916376,
77
+ 36, 200, 5404, 6508535, 5, 492, 858526451332425960,
78
+ 37, 200, 5944, 2991657, 5, 492, 624906434274621995,
79
+ 38, 200, 6538, 6736565, 6, 511, 589153542019036049,
80
+ 39, 200, 7191, 1579893, 6, 507, 10255312374117907,
81
+ 40, 200, 7910, 412509, 6, 538, 570863587164194186,
82
+ 41, 200, 8701, 1112089, 6, 477, 553100668286355347,
83
+ 42, 200, 9571, 1258813, 6, 526, 344845406406036297,
84
+ 43, 200, 10528, 1980049, 6, 508, 411846569527905064,
85
+ 44, 200, 11580, 2167127, 6, 520, 966876726203675488,
86
+ 45, 200, 12738, 1975435, 7, 561, 724125506920592732,
87
+ 46, 200, 14011, 4289627, 7, 560, 753686005174215572,
88
+ 47, 200, 15412, 5384001, 7, 494, 551637841878573955,
89
+ 48, 200, 16953, 2902685, 7, 560, 94602851752354802,
90
+ 49, 200, 18648, 4806445, 7, 562, 597672400688514221,
91
+ 50, 200, 20512, 2085, 7, 529, 417280161591969960,
92
+ 51, 200, 22563, 6375939, 7, 558, 11300453985206678,
93
+ 52, 200, 24819, 7837057, 7, 559, 283668599967437754,
94
+ 53, 200, 27300, 6607975, 8, 561, 122183647493325363,
95
+ 54, 200, 30030, 1519191, 8, 550, 1145227891427321202,
96
+ 55, 200, 33033, 808061, 8, 568, 71070843834364939,
97
+ 56, 200, 36336, 2653529, 8, 570, 450311772805359006,
98
+ 57, 200, 39969, 2188957, 8, 561, 269670427054904115,
99
+ 58, 200, 43965, 5885655, 8, 539, 1039064186324091890,
100
+ 59, 200, 48361, 6185889, 8, 574, 178055275082387938,
101
+ 60, 200, 53197, 208767, 9, 579, 139766040442973048,
102
+ 61, 200, 58516, 2551345, 9, 569, 322655279254252950,
103
+ 62, 200, 64367, 1950873, 9, 569, 101542216315768285,
104
+ 63, 200, 70803, 2950429, 9, 582, 72294008568551853,
105
+ 64, 200, 77883, 3993977, 9, 572, 299014330559512530,
106
+ 65, 200, 85671, 428871, 9, 585, 491351721800568188,
107
+ 66, 200, 94238, 6740849, 9, 577, 656204268858348899,
108
+ 67, 200, 103661, 2315497, 9, 562, 829926273188300764,
109
+ 68, 200, 114027, 5212835, 10, 581, 542222554617639557,
110
+ 69, 200, 125429, 4213475, 10, 593, 713339189579860773,
111
+ 70, 200, 137971, 2411583, 10, 592, 649651658985845357,
112
+ 71, 200, 151768, 5243307, 10, 567, 1017459402785275179,
113
+ 72, 200, 166944, 2468367, 10, 593, 115034451827634398,
114
+ 73, 200, 183638, 2210923, 10, 583, 365735165000548572,
115
+ 74, 200, 202001, 321257, 10, 591, 928479940794929153,
116
+ 75, 200, 222201, 8185105, 11, 600, 780163958693677795,
117
+ 76, 200, 244421, 6205349, 11, 598, 132454307780236135,
118
+ 77, 200, 268863, 3165901, 11, 600, 369824066179493948,
119
+ 78, 200, 295749, 2831723, 11, 595, 80968411797441666,
120
+ 79, 200, 325323, 464193, 11, 594, 125773061716381917,
121
+ 80, 200, 357855, 7499035, 11, 576, 994150328579932916,
122
+ 81, 200, 393640, 1514479, 11, 596, 111092193875842594,
123
+ 82, 200, 433004, 668493, 12, 607, 497338041653302784,
124
+ 83, 200, 476304, 3174931, 12, 606, 845986926165673887,
125
+ 84, 200, 523934, 914611, 12, 605, 354993119685278556,
126
+ 85, 200, 576327, 7270385, 12, 602, 937679531753465428,
127
+ 86, 200, 633959, 1956979, 12, 598, 659413123921208266,
128
+ 87, 200, 697354, 3137635, 12, 606, 874228711599628459,
129
+ 88, 200, 767089, 214923, 12, 608, 1077644643342432307,
130
+ 89, 200, 843797, 3084545, 13, 612, 79317113064339979,
131
+ 90, 200, 928176, 7800899, 13, 612, 357414065779796772,
132
+ 91, 200, 1020993, 6717253, 13, 615, 532723577905833296,
133
+ 92, 200, 1123092, 5543015, 13, 614, 508695073250223746,
134
+ 93, 200, 1235401, 298785, 13, 616, 34344606952783179,
135
+ 94, 200, 1358941, 4530313, 13, 607, 169924026179364121,
136
+ 95, 200, 1494835, 4406457, 13, 612, 1026773494313671061,
137
+ 96, 200, 1644318, 1540983, 13, 614, 423454640036650614,
138
+ 97, 200, 1808749, 7999631, 14, 624, 466122870338520329,
139
+ 98, 200, 1989623, 4295537, 14, 621, 609309853701283445,
140
+ 99, 200, 2188585, 7379971, 14, 622, 141739898871015642,
141
+ 100, 200, 2407443, 6188931, 14, 621, 22515080776738923,
142
+ 101, 200, 2648187, 6701239, 14, 619, 257441864177795548,
143
+ 102, 200, 2913005, 2238709, 14, 623, 867028825821064773,
144
+ 103, 200, 3204305, 5371075, 14, 625, 1110615471273395112,
145
+ 104, 200, 3524735, 7017341, 15, 631, 619518037415974467,
146
+ 105, 200, 3877208, 323337, 15, 633, 513230912593541122,
147
+ 106, 200, 4264928, 6172471, 15, 628, 885861662583325072,
148
+ 107, 200, 4691420, 5653803, 15, 633, 754052473303005204,
149
+ 108, 200, 5160562, 1385265, 15, 630, 294993765757975100,
150
+ 109, 200, 5676618, 4350899, 15, 617, 1073144684944932303,
151
+ 110, 200, 6244279, 1272235, 15, 630, 308982934296855020,
152
+ 111, 200, 6868706, 1763939, 16, 638, 356231694823272867,
153
+ 112, 200, 7555576, 3703411, 16, 636, 20043268926300101,
154
+ 113, 200, 8311133, 6554171, 16, 637, 121111429906734123
155
+ };
156
+
157
+ static std::unique_ptr<int[]> make_input_array(unsigned n, unsigned stride) {
158
+ assert (kll_helper::is_odd(stride));
159
+ unsigned mask((1 << 23) - 1); // because library items are single-precision floats at the moment
160
+ unsigned cur(0);
161
+ std::unique_ptr<int[]> arr(new int[n]);
162
+ for (unsigned i = 0; i < n; i++) {
163
+ cur += stride;
164
+ cur &= mask;
165
+ arr[i] = cur;
166
+ }
167
+ return arr;
168
+ }
169
+
170
// Computes a simple order-dependent 60-bit hash of arr[start .. start + length).
// Each item is truncated to an integer, added to the accumulator, multiplied by
// a fixed odd constant, reduced to 60 bits and then mixed with its own upper bits.
//   arr    - items to hash (read only)
//   start  - index of the first item to include
//   length - number of items to include; 0 yields a hash of 0
// Returns the accumulated hash, always in the range [0, 2^60).
static int64_t simple_hash_of_sub_array(const float* arr, unsigned start, unsigned length) {
  const uint64_t multiplier = 738219921; // an arbitrary odd 30-bit number
  const uint64_t mask60 = (1ULL << 60) - 1ULL;
  // The accumulator is unsigned on purpose: a 60-bit value times a 30-bit
  // multiplier exceeds 63 bits, and signed overflow would be undefined behavior.
  // Unsigned arithmetic wraps modulo 2^64, which leaves the low 60 bits that
  // survive the mask unchanged.
  uint64_t accum = 0;
  const unsigned end = start + length;
  for (unsigned i = start; i < end; i++) {
    accum += static_cast<uint64_t>(static_cast<int64_t>(arr[i]));
    accum *= multiplier;
    accum &= mask60;
    accum ^= accum >> 30;
  }
  return static_cast<int64_t>(accum);
}
182
+
183
+ TEST_CASE("kll validation", "[kll_sketch][validation]") {
184
+ for (unsigned i = 0; i < num_tests; i++) {
185
+ assert (correct_results[7 * i] == i);
186
+ unsigned k(correct_results[7 * i + 1]);
187
+ unsigned n(correct_results[7 * i + 2]);
188
+ unsigned stride(correct_results[7 * i + 3]);
189
+ std::unique_ptr<int[]> input_array = make_input_array(n, stride);
190
+ kll_sketch<float> sketch(k);
191
+ kll_next_offset = 0;
192
+ for (unsigned j = 0; j < n; j++) {
193
+ sketch.update(input_array[j]);
194
+ }
195
+ unsigned num_levels = sketch.get_num_levels();
196
+ unsigned num_samples = sketch.get_num_retained();
197
+ int64_t hashed_samples = simple_hash_of_sub_array(sketch.get_items(), sketch.get_levels()[0], num_samples);
198
+ std::cout << i;
199
+ REQUIRE(correct_results[7 * i + 4] == num_levels);
200
+ REQUIRE(correct_results[7 * i + 5] == num_samples);
201
+ if (correct_results[7 * i + 6] == hashed_samples) {
202
+ std::cout << " pass" << std::endl;
203
+ } else {
204
+ std::cout << " " << (correct_results[7 * i + 6]) << " != " << hashed_samples;
205
+ sketch.to_stream(std::cout);
206
+ FAIL();
207
+ }
208
+ }
209
+ }
210
+
211
+ TEST_CASE("kll validation: test hash", "[kll_sketch][validaiton]") {
212
+ float array[] = { 907500, 944104, 807020, 219921, 678370, 955217, 426885 };
213
+ REQUIRE(simple_hash_of_sub_array(array, 1, 5) == 1141543353991880193LL);
214
+ }
215
+
216
+ TEST_CASE("kll validation: make input array", "[kll_sketch][validaiton]") {
217
+ int expected_array[6] = { 3654721, 7309442, 2575555, 6230276, 1496389, 5151110 };
218
+ auto array(make_input_array(6, 3654721));
219
+ REQUIRE(array[0] == expected_array[0]);
220
+ REQUIRE(array[1] == expected_array[1]);
221
+ REQUIRE(array[2] == expected_array[2]);
222
+ REQUIRE(array[3] == expected_array[3]);
223
+ REQUIRE(array[4] == expected_array[4]);
224
+ REQUIRE(array[5] == expected_array[5]);
225
+ }
226
+
227
+ } /* namespace datasketches */
228
+
229
+ #endif
@@ -0,0 +1,17 @@
1
+ [build-system]
2
+ requires = ["wheel",
3
+ "setuptools >= 30.3.0",
4
+ "setuptools_scm",
5
+ "cmake >= 3.12"]
6
+
7
+ [tool.tox]
8
+ legacy_tox_ini = """
9
+ [tox]
10
+ envlist = py3
11
+
12
+ [testenv]
13
+ deps = pytest
14
+ numpy
15
+ changedir = python/tests
16
+ commands = pytest
17
+ """
@@ -0,0 +1,61 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ # TODO: Can we force python version >= 3.0?
19
+ if (MSVC)
20
+ set(PYBIND11_CPP_STANDARD /std:c++11)
21
+ else()
22
+ set(PYBIND11_CPP_STANDARD -std=c++11)
23
+ endif()
24
+
25
+ add_subdirectory(pybind11)
26
+
27
+ pybind11_add_module(python MODULE EXCLUDE_FROM_ALL SYSTEM THIN_LTO)
28
+
29
+ target_link_libraries(python
30
+ PRIVATE
31
+ common
32
+ hll
33
+ kll
34
+ cpc
35
+ fi
36
+ theta
37
+ sampling
38
+ pybind11::module
39
+ )
40
+
41
+ set_target_properties(python PROPERTIES
42
+ PREFIX ""
43
+ OUTPUT_NAME datasketches
44
+ )
45
+
46
+ # ensure we make a .so on Mac rather than .dylib
47
+ if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
48
+ set_target_properties(python PROPERTIES SUFFIX ".so")
49
+ endif()
50
+
51
+ target_sources(python
52
+ PRIVATE
53
+ src/datasketches.cpp
54
+ src/hll_wrapper.cpp
55
+ src/kll_wrapper.cpp
56
+ src/cpc_wrapper.cpp
57
+ src/fi_wrapper.cpp
58
+ src/theta_wrapper.cpp
59
+ src/vo_wrapper.cpp
60
+ src/vector_of_kll.cpp
61
+ )
@@ -0,0 +1,78 @@
1
+ # Python Wrapper for Datasketches
2
+
3
+ ## Installation
4
+
5
+ The release files do not include the needed python binding library ([pybind11](https://github.com/pybind/pybind11)). If building
6
+ from a release package, you must ensure that the pybind11 directory points to a local copy of pybind11.
7
+
8
+ An official pypi build is eventually planned but not yet available.
9
+
10
+ If you instead want to take a (possibly ill-advised) gamble on the current state of the master branch being useable, you can run:
11
+ ```pip install git+https://github.com/apache/datasketches-cpp.git```
12
+
13
+ ## Developer Instructions
14
+
15
+ ### Building
16
+
17
+ When cloning the source repository, you should include the pybind11 submodule with the `--recursive` option to the clone command:
18
+ ```
19
+ git clone --recursive https://github.com/apache/datasketches-cpp.git
20
+ cd datasketches-cpp
21
+ python -m pip install --upgrade pip setuptools wheel numpy
22
+ python setup.py build
23
+ ```
24
+
25
+ If you cloned without `--recursive`, you can add the submodule post-checkout using `git submodule update --init --recursive`.
26
+
27
+ ### Installing
28
+
29
+ Assuming you have already checked out the library and any dependent submodules, install by simply replacing the last
30
+ line of the build command with `python setup.py install`.
31
+
32
+ ### Unit tests
33
+
34
+ The python tests are run with `tox`. To ensure you have all the needed packages, from the package base directory run:
35
+ ```
36
+ python -m pip install --upgrade pip setuptools wheel numpy tox
37
+ tox
38
+ ```
39
+
40
+ ## Usage
41
+
42
+ Having installed the library, loading the Datasketches library in Python is simple: `import datasketches`.
43
+
44
+ ## Available Sketch Classes
45
+
46
+ - KLL
47
+ - `kll_ints_sketch`
48
+ - `kll_floats_sketch`
49
+ - Frequent Items
50
+ - `frequent_strings_sketch`
51
+ - Error types are `frequent_items_error_type.{NO_FALSE_NEGATIVES | NO_FALSE_POSITIVES}`
52
+ - Theta
53
+ - `update_theta_sketch`
54
+ - `compact_theta_sketch` (cannot be instantiated directly)
55
+ - `theta_union`
56
+ - `theta_intersection`
57
+ - `theta_a_not_b`
58
+ - HLL
59
+ - `hll_sketch`
60
+ - `hll_union`
61
+ - Target HLL types are `tgt_hll_type.{HLL_4 | HLL_6 | HLL_8}`
62
+ - CPC
63
+ - `cpc_sketch`
64
+ - `cpc_union`
65
+ - VarOpt Sampling
66
+ - `var_opt_sketch`
67
+ - `var_opt_union`
68
+ - Vector of KLL
69
+ - `vector_of_kll_ints_sketches`
70
+ - `vector_of_kll_floats_sketches`
71
+
72
+ ## Known Differences from C++
73
+
74
+ The Python API largely mirrors the C++ API, with a few minor exceptions: The primary known differences are that Python on modern platforms does not support unsigned integer values or numeric values with fewer than 64 bits. As a result, you may not be able to produce identical sketches from within Python as you can with Java and C++. Loading those sketches after they have been serialized from another language will work as expected.
75
+
76
+ The Vector of KLL object is currently exclusive to python, and holds an array of independent KLL sketches. This is useful for creating a set of KLL sketches over a vector and has been designed to allow input as either a vector or a matrix of multiple vectors.
77
+
78
+ We have also removed reliance on a builder class for theta sketches as Python allows named arguments to the constructor, not strictly positional arguments.
@@ -0,0 +1,345 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "## CPC Sketch Examples"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "### Basic Sketch Usage"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 1,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "from datasketches import cpc_sketch, cpc_union"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "markdown",
28
+ "metadata": {},
29
+ "source": [
30
+ "We'll create a sketch with log2(k) = 12"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 2,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "sk = cpc_sketch(12)"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "markdown",
44
+ "metadata": {},
45
+ "source": [
46
+ "Insert ~2 million points. Values are hashed, so using sequential integers is fine for demonstration purposes."
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": 3,
52
+ "metadata": {},
53
+ "outputs": [
54
+ {
55
+ "name": "stdout",
56
+ "output_type": "stream",
57
+ "text": [
58
+ "### CPC sketch summary:\n",
59
+ " lgK : 12\n",
60
+ " seed hash : 93cc\n",
61
+ " C : 38212\n",
62
+ " flavor : 4\n",
63
+ " merged : false\n",
64
+ " compressed : false\n",
65
+ " intresting col : 5\n",
66
+ " HIP estimate : 2.09721e+06\n",
67
+ " kxp : 11.4725\n",
68
+ " offset : 6\n",
69
+ " table : allocated\n",
70
+ " num SV : 135\n",
71
+ " window : allocated\n",
72
+ "### End sketch summary\n",
73
+ "\n"
74
+ ]
75
+ }
76
+ ],
77
+ "source": [
78
+ "n = 1 << 21\n",
79
+ "for i in range(0, n):\n",
80
+ " sk.update(i)\n",
81
+ "print(sk)"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "markdown",
86
+ "metadata": {},
87
+ "source": [
88
+ "Since we know the exact value of n we can look at the estimate and upper/lower bounds as a % of the true value. We'll look at the bounds at 1 standard deviation. In this case, the true value does lie within the bounds, but since these are probabilistic bounds the true value will sometimes be outside them (especially at 1 standard deviation)."
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": 4,
94
+ "metadata": {},
95
+ "outputs": [
96
+ {
97
+ "name": "stdout",
98
+ "output_type": "stream",
99
+ "text": [
100
+ "Upper bound (1 std. dev) as % of true value: 100.9281\n"
101
+ ]
102
+ }
103
+ ],
104
+ "source": [
105
+ "print(\"Upper bound (1 std. dev) as % of true value: \", round(100*sk.get_upper_bound(1) / n, 4))"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": 5,
111
+ "metadata": {},
112
+ "outputs": [
113
+ {
114
+ "name": "stdout",
115
+ "output_type": "stream",
116
+ "text": [
117
+ "Estimate as % of true value: 100.0026\n"
118
+ ]
119
+ }
120
+ ],
121
+ "source": [
122
+ "print(\"Estimate as % of true value: \", round(100*sk.get_estimate() / n, 4))"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": 6,
128
+ "metadata": {},
129
+ "outputs": [
130
+ {
131
+ "name": "stdout",
132
+ "output_type": "stream",
133
+ "text": [
134
+ "Lower bound (1 std. dev) as % of true value: 99.0935\n"
135
+ ]
136
+ }
137
+ ],
138
+ "source": [
139
+ "print(\"Lower bound (1 std. dev) as % of true value: \", round(100*sk.get_lower_bound(1) / n, 4))"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "markdown",
144
+ "metadata": {},
145
+ "source": [
146
+ "Finally, we can serialize and deserialize the sketch, which will give us back the same structure."
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "code",
151
+ "execution_count": 7,
152
+ "metadata": {},
153
+ "outputs": [
154
+ {
155
+ "data": {
156
+ "text/plain": [
157
+ "2484"
158
+ ]
159
+ },
160
+ "execution_count": 7,
161
+ "metadata": {},
162
+ "output_type": "execute_result"
163
+ }
164
+ ],
165
+ "source": [
166
+ "sk_bytes = sk.serialize()\n",
167
+ "len(sk_bytes)"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": 8,
173
+ "metadata": {},
174
+ "outputs": [
175
+ {
176
+ "name": "stdout",
177
+ "output_type": "stream",
178
+ "text": [
179
+ "### CPC sketch summary:\n",
180
+ " lgK : 12\n",
181
+ " seed hash : 93cc\n",
182
+ " C : 38212\n",
183
+ " flavor : 4\n",
184
+ " merged : false\n",
185
+ " compressed : false\n",
186
+ " intresting col : 5\n",
187
+ " HIP estimate : 2.09721e+06\n",
188
+ " kxp : 11.4725\n",
189
+ " offset : 6\n",
190
+ " table : allocated\n",
191
+ " num SV : 135\n",
192
+ " window : allocated\n",
193
+ "### End sketch summary\n",
194
+ "\n"
195
+ ]
196
+ }
197
+ ],
198
+ "source": [
199
+ "sk2 = cpc_sketch.deserialize(sk_bytes)\n",
200
+ "print(sk2)"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "markdown",
205
+ "metadata": {},
206
+ "source": [
207
+ "### Sketch Union Usage"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "markdown",
212
+ "metadata": {},
213
+ "source": [
214
+ "Here, we'll create two sketches with partial overlap in values. For good measure, we'll let k be larger in one sketch. For most applications we'd generally create all new data using the same size sketch, allowing differences to creep in when combining new and historical data."
215
+ ]
216
+ },
217
+ {
218
+ "cell_type": "code",
219
+ "execution_count": 9,
220
+ "metadata": {},
221
+ "outputs": [],
222
+ "source": [
223
+ "k = 12\n",
224
+ "n = 1 << 20\n",
225
+ "offset = int(3 * n / 4)"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": 10,
231
+ "metadata": {},
232
+ "outputs": [],
233
+ "source": [
234
+ "sk1 = cpc_sketch(k)\n",
235
+ "sk2 = cpc_sketch(k + 1)\n",
236
+ "for i in range(0, n):\n",
237
+ " sk1.update(i)\n",
238
+ " sk2.update(i + offset)"
239
+ ]
240
+ },
241
+ {
242
+ "cell_type": "markdown",
243
+ "metadata": {},
244
+ "source": [
245
+ "Create a union object and add the sketches to that. To demonstrate smoothly handling multiple sketch sizes, we'll use a size of k+1 here."
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": 11,
251
+ "metadata": {},
252
+ "outputs": [],
253
+ "source": [
254
+ "union = cpc_union(k+1)\n",
255
+ "union.update(sk1)\n",
256
+ "union.update(sk2)"
257
+ ]
258
+ },
259
+ {
260
+ "cell_type": "markdown",
261
+ "metadata": {},
262
+ "source": [
263
+ "Note how log config k has automatically adopted the value of the smaller input sketch."
264
+ ]
265
+ },
266
+ {
267
+ "cell_type": "code",
268
+ "execution_count": 12,
269
+ "metadata": {},
270
+ "outputs": [
271
+ {
272
+ "name": "stdout",
273
+ "output_type": "stream",
274
+ "text": [
275
+ "### CPC sketch summary:\n",
276
+ " lgK : 12\n",
277
+ " seed hash : 93cc\n",
278
+ " C : 37418\n",
279
+ " flavor : 4\n",
280
+ " merged : true\n",
281
+ " compressed : false\n",
282
+ " intresting col : 5\n",
283
+ " HIP estimate : 0\n",
284
+ " kxp : 4096\n",
285
+ " offset : 6\n",
286
+ " table : allocated\n",
287
+ " num SV : 123\n",
288
+ " window : allocated\n",
289
+ "### End sketch summary\n",
290
+ "\n"
291
+ ]
292
+ }
293
+ ],
294
+ "source": [
295
+ "result = union.get_result()\n",
296
+ "print(result)"
297
+ ]
298
+ },
299
+ {
300
+ "cell_type": "markdown",
301
+ "metadata": {},
302
+ "source": [
303
+ "We can again compare against the exact result, in this case 1.75*n"
304
+ ]
305
+ },
306
+ {
307
+ "cell_type": "code",
308
+ "execution_count": 13,
309
+ "metadata": {},
310
+ "outputs": [
311
+ {
312
+ "name": "stdout",
313
+ "output_type": "stream",
314
+ "text": [
315
+ "Estimate as % of true value: 99.6646\n"
316
+ ]
317
+ }
318
+ ],
319
+ "source": [
320
+ "print(\"Estimate as % of true value: \", round(100*result.get_estimate() / (7*n/4), 4))"
321
+ ]
322
+ }
323
+ ],
324
+ "metadata": {
325
+ "kernelspec": {
326
+ "display_name": "Python 3",
327
+ "language": "python",
328
+ "name": "python3"
329
+ },
330
+ "language_info": {
331
+ "codemirror_mode": {
332
+ "name": "ipython",
333
+ "version": 3
334
+ },
335
+ "file_extension": ".py",
336
+ "mimetype": "text/x-python",
337
+ "name": "python",
338
+ "nbconvert_exporter": "python",
339
+ "pygments_lexer": "ipython3",
340
+ "version": "3.7.0"
341
+ }
342
+ },
343
+ "nbformat": 4,
344
+ "nbformat_minor": 2
345
+ }