datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,229 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch.hpp>
21
+
22
+ #include <kll_sketch.hpp>
23
+ #include <kll_helper.hpp>
24
+
25
+ #include <assert.h>
26
+
27
+ #ifdef KLL_VALIDATION
28
+
29
+ // This is to make sure the implementation matches exactly the reference implementation in OCaml.
30
+ // Conditional compilation is used because the implementation needs a few modifications:
31
+ // - switch from random choice to deterministic
32
+ // - a few methods to expose internals of the sketch
33
+
34
+ namespace datasketches {
35
+
36
+ uint32_t kll_next_offset; // to make kll_sketch deterministic
37
+
38
+ constexpr unsigned num_tests = 114;
39
+
40
+ const int64_t correct_results[num_tests * 7] = {
41
+ 0, 200, 180, 3246533, 1, 180, 1098352976109474698,
42
+ 1, 200, 198, 8349603, 1, 198, 686681527497651888,
43
+ 2, 200, 217, 676491, 2, 117, 495856134049157644,
44
+ 3, 200, 238, 3204507, 2, 138, 44453438498725402,
45
+ 4, 200, 261, 2459373, 2, 161, 719830627391926938,
46
+ 5, 200, 287, 5902143, 2, 187, 389303173170515580,
47
+ 6, 200, 315, 5188793, 2, 215, 985218890825795000,
48
+ 7, 200, 346, 801923, 2, 246, 589362992166904413,
49
+ 8, 200, 380, 2466269, 2, 280, 1081848693781775853,
50
+ 9, 200, 418, 5968041, 2, 318, 533825689515788397,
51
+ 10, 200, 459, 3230027, 2, 243, 937332670315558786,
52
+ 11, 200, 504, 5125875, 2, 288, 1019197831515566845,
53
+ 12, 200, 554, 4195571, 3, 230, 797351479150148224,
54
+ 13, 200, 609, 2221181, 3, 285, 451246040374318529,
55
+ 14, 200, 669, 5865503, 3, 345, 253851269470815909,
56
+ 15, 200, 735, 831703, 3, 411, 491974970526372303,
57
+ 16, 200, 808, 4830785, 3, 327, 1032107507126916277,
58
+ 17, 200, 888, 1356257, 3, 407, 215225420986342944,
59
+ 18, 200, 976, 952071, 3, 417, 600280049738270697,
60
+ 19, 200, 1073, 6729833, 3, 397, 341758522977365969,
61
+ 20, 200, 1180, 6017925, 3, 406, 1080227312339182949,
62
+ 21, 200, 1298, 4229891, 3, 401, 1092460534756675086,
63
+ 22, 200, 1427, 7264889, 4, 320, 884533400696890024,
64
+ 23, 200, 1569, 5836327, 4, 462, 660575800011134382,
65
+ 24, 200, 1725, 5950087, 4, 416, 669373957401387528,
66
+ 25, 200, 1897, 2692555, 4, 406, 607308667566496888,
67
+ 26, 200, 2086, 1512443, 4, 459, 744260340112029032,
68
+ 27, 200, 2294, 2681171, 4, 434, 199120609113802485,
69
+ 28, 200, 2523, 3726521, 4, 450, 570993497599288304,
70
+ 29, 200, 2775, 2695247, 4, 442, 306717093329516310,
71
+ 30, 200, 3052, 5751175, 5, 400, 256024589545754217,
72
+ 31, 200, 3357, 1148897, 5, 514, 507276662329207479,
73
+ 32, 200, 3692, 484127, 5, 457, 1082660223488175122,
74
+ 33, 200, 4061, 6414559, 5, 451, 620820308918522117,
75
+ 34, 200, 4467, 5587461, 5, 466, 121975084804459305,
76
+ 35, 200, 4913, 1615017, 5, 483, 152986529342916376,
77
+ 36, 200, 5404, 6508535, 5, 492, 858526451332425960,
78
+ 37, 200, 5944, 2991657, 5, 492, 624906434274621995,
79
+ 38, 200, 6538, 6736565, 6, 511, 589153542019036049,
80
+ 39, 200, 7191, 1579893, 6, 507, 10255312374117907,
81
+ 40, 200, 7910, 412509, 6, 538, 570863587164194186,
82
+ 41, 200, 8701, 1112089, 6, 477, 553100668286355347,
83
+ 42, 200, 9571, 1258813, 6, 526, 344845406406036297,
84
+ 43, 200, 10528, 1980049, 6, 508, 411846569527905064,
85
+ 44, 200, 11580, 2167127, 6, 520, 966876726203675488,
86
+ 45, 200, 12738, 1975435, 7, 561, 724125506920592732,
87
+ 46, 200, 14011, 4289627, 7, 560, 753686005174215572,
88
+ 47, 200, 15412, 5384001, 7, 494, 551637841878573955,
89
+ 48, 200, 16953, 2902685, 7, 560, 94602851752354802,
90
+ 49, 200, 18648, 4806445, 7, 562, 597672400688514221,
91
+ 50, 200, 20512, 2085, 7, 529, 417280161591969960,
92
+ 51, 200, 22563, 6375939, 7, 558, 11300453985206678,
93
+ 52, 200, 24819, 7837057, 7, 559, 283668599967437754,
94
+ 53, 200, 27300, 6607975, 8, 561, 122183647493325363,
95
+ 54, 200, 30030, 1519191, 8, 550, 1145227891427321202,
96
+ 55, 200, 33033, 808061, 8, 568, 71070843834364939,
97
+ 56, 200, 36336, 2653529, 8, 570, 450311772805359006,
98
+ 57, 200, 39969, 2188957, 8, 561, 269670427054904115,
99
+ 58, 200, 43965, 5885655, 8, 539, 1039064186324091890,
100
+ 59, 200, 48361, 6185889, 8, 574, 178055275082387938,
101
+ 60, 200, 53197, 208767, 9, 579, 139766040442973048,
102
+ 61, 200, 58516, 2551345, 9, 569, 322655279254252950,
103
+ 62, 200, 64367, 1950873, 9, 569, 101542216315768285,
104
+ 63, 200, 70803, 2950429, 9, 582, 72294008568551853,
105
+ 64, 200, 77883, 3993977, 9, 572, 299014330559512530,
106
+ 65, 200, 85671, 428871, 9, 585, 491351721800568188,
107
+ 66, 200, 94238, 6740849, 9, 577, 656204268858348899,
108
+ 67, 200, 103661, 2315497, 9, 562, 829926273188300764,
109
+ 68, 200, 114027, 5212835, 10, 581, 542222554617639557,
110
+ 69, 200, 125429, 4213475, 10, 593, 713339189579860773,
111
+ 70, 200, 137971, 2411583, 10, 592, 649651658985845357,
112
+ 71, 200, 151768, 5243307, 10, 567, 1017459402785275179,
113
+ 72, 200, 166944, 2468367, 10, 593, 115034451827634398,
114
+ 73, 200, 183638, 2210923, 10, 583, 365735165000548572,
115
+ 74, 200, 202001, 321257, 10, 591, 928479940794929153,
116
+ 75, 200, 222201, 8185105, 11, 600, 780163958693677795,
117
+ 76, 200, 244421, 6205349, 11, 598, 132454307780236135,
118
+ 77, 200, 268863, 3165901, 11, 600, 369824066179493948,
119
+ 78, 200, 295749, 2831723, 11, 595, 80968411797441666,
120
+ 79, 200, 325323, 464193, 11, 594, 125773061716381917,
121
+ 80, 200, 357855, 7499035, 11, 576, 994150328579932916,
122
+ 81, 200, 393640, 1514479, 11, 596, 111092193875842594,
123
+ 82, 200, 433004, 668493, 12, 607, 497338041653302784,
124
+ 83, 200, 476304, 3174931, 12, 606, 845986926165673887,
125
+ 84, 200, 523934, 914611, 12, 605, 354993119685278556,
126
+ 85, 200, 576327, 7270385, 12, 602, 937679531753465428,
127
+ 86, 200, 633959, 1956979, 12, 598, 659413123921208266,
128
+ 87, 200, 697354, 3137635, 12, 606, 874228711599628459,
129
+ 88, 200, 767089, 214923, 12, 608, 1077644643342432307,
130
+ 89, 200, 843797, 3084545, 13, 612, 79317113064339979,
131
+ 90, 200, 928176, 7800899, 13, 612, 357414065779796772,
132
+ 91, 200, 1020993, 6717253, 13, 615, 532723577905833296,
133
+ 92, 200, 1123092, 5543015, 13, 614, 508695073250223746,
134
+ 93, 200, 1235401, 298785, 13, 616, 34344606952783179,
135
+ 94, 200, 1358941, 4530313, 13, 607, 169924026179364121,
136
+ 95, 200, 1494835, 4406457, 13, 612, 1026773494313671061,
137
+ 96, 200, 1644318, 1540983, 13, 614, 423454640036650614,
138
+ 97, 200, 1808749, 7999631, 14, 624, 466122870338520329,
139
+ 98, 200, 1989623, 4295537, 14, 621, 609309853701283445,
140
+ 99, 200, 2188585, 7379971, 14, 622, 141739898871015642,
141
+ 100, 200, 2407443, 6188931, 14, 621, 22515080776738923,
142
+ 101, 200, 2648187, 6701239, 14, 619, 257441864177795548,
143
+ 102, 200, 2913005, 2238709, 14, 623, 867028825821064773,
144
+ 103, 200, 3204305, 5371075, 14, 625, 1110615471273395112,
145
+ 104, 200, 3524735, 7017341, 15, 631, 619518037415974467,
146
+ 105, 200, 3877208, 323337, 15, 633, 513230912593541122,
147
+ 106, 200, 4264928, 6172471, 15, 628, 885861662583325072,
148
+ 107, 200, 4691420, 5653803, 15, 633, 754052473303005204,
149
+ 108, 200, 5160562, 1385265, 15, 630, 294993765757975100,
150
+ 109, 200, 5676618, 4350899, 15, 617, 1073144684944932303,
151
+ 110, 200, 6244279, 1272235, 15, 630, 308982934296855020,
152
+ 111, 200, 6868706, 1763939, 16, 638, 356231694823272867,
153
+ 112, 200, 7555576, 3703411, 16, 636, 20043268926300101,
154
+ 113, 200, 8311133, 6554171, 16, 637, 121111429906734123
155
+ };
156
+
157
// Builds the deterministic input sequence used by the validation scenarios.
// Starting from zero, each element is the previous value plus `stride`,
// reduced modulo 2^23, so the result is stride, 2*stride, ... (mod 2^23).
//
// n      - number of elements to generate
// stride - increment between consecutive elements; must be odd (checked by assert)
// returns an owning array of n ints
static std::unique_ptr<int[]> make_input_array(unsigned n, unsigned stride) {
  assert((stride & 1u) != 0u); // stride must be odd
  const unsigned mask = (1u << 23) - 1u; // because library items are single-precision floats at the moment
  unsigned cur = 0;
  std::unique_ptr<int[]> arr(new int[n]);
  for (unsigned i = 0; i < n; i++) {
    cur = (cur + stride) & mask;
    // the masked value is below 2^23, so the conversion to int is lossless
    arr[i] = static_cast<int>(cur);
  }
  return arr;
}
169
+
170
// Computes a simple order-dependent hash of the items arr[start .. start + length).
// Each item is truncated to an integer, added to the accumulator, multiplied by a
// fixed odd constant, reduced to 60 bits and mixed with its own upper bits.
//
// arr    - pointer to the items; must be valid for indices start .. start + length - 1
// start  - index of the first item to hash
// length - number of items to hash (0 yields a hash of 0)
// returns a non-negative value that fits in 60 bits
static int64_t simple_hash_of_sub_array(const float* arr, unsigned start, unsigned length) {
  // The arithmetic is done in unsigned 64-bit integers: a 60-bit accumulator times a
  // 30-bit multiplier does not fit in 64 bits, and signed overflow is undefined
  // behaviour, whereas unsigned arithmetic wraps modulo 2^64. Only the low 60 bits
  // are kept, so the result is the same as with wrapping signed arithmetic.
  const uint64_t multiplier = 738219921ULL; // an arbitrary odd 30-bit number
  const uint64_t mask60 = (1ULL << 60) - 1ULL;
  uint64_t accum = 0;
  for (unsigned i = start; i < start + length; i++) {
    accum += static_cast<uint64_t>(static_cast<int64_t>(arr[i]));
    accum *= multiplier;
    accum &= mask60;
    accum ^= accum >> 30;
  }
  return static_cast<int64_t>(accum);
}
182
+
183
+ TEST_CASE("kll validation", "[kll_sketch][validation]") {
184
+ for (unsigned i = 0; i < num_tests; i++) {
185
+ assert (correct_results[7 * i] == i);
186
+ unsigned k(correct_results[7 * i + 1]);
187
+ unsigned n(correct_results[7 * i + 2]);
188
+ unsigned stride(correct_results[7 * i + 3]);
189
+ std::unique_ptr<int[]> input_array = make_input_array(n, stride);
190
+ kll_sketch<float> sketch(k);
191
+ kll_next_offset = 0;
192
+ for (unsigned j = 0; j < n; j++) {
193
+ sketch.update(input_array[j]);
194
+ }
195
+ unsigned num_levels = sketch.get_num_levels();
196
+ unsigned num_samples = sketch.get_num_retained();
197
+ int64_t hashed_samples = simple_hash_of_sub_array(sketch.get_items(), sketch.get_levels()[0], num_samples);
198
+ std::cout << i;
199
+ REQUIRE(correct_results[7 * i + 4] == num_levels);
200
+ REQUIRE(correct_results[7 * i + 5] == num_samples);
201
+ if (correct_results[7 * i + 6] == hashed_samples) {
202
+ std::cout << " pass" << std::endl;
203
+ } else {
204
+ std::cout << " " << (correct_results[7 * i + 6]) << " != " << hashed_samples;
205
+ sketch.to_stream(std::cout);
206
+ FAIL();
207
+ }
208
+ }
209
+ }
210
+
211
+ TEST_CASE("kll validation: test hash", "[kll_sketch][validaiton]") {
212
+ float array[] = { 907500, 944104, 807020, 219921, 678370, 955217, 426885 };
213
+ REQUIRE(simple_hash_of_sub_array(array, 1, 5) == 1141543353991880193LL);
214
+ }
215
+
216
+ TEST_CASE("kll validation: make input array", "[kll_sketch][validaiton]") {
217
+ int expected_array[6] = { 3654721, 7309442, 2575555, 6230276, 1496389, 5151110 };
218
+ auto array(make_input_array(6, 3654721));
219
+ REQUIRE(array[0] == expected_array[0]);
220
+ REQUIRE(array[1] == expected_array[1]);
221
+ REQUIRE(array[2] == expected_array[2]);
222
+ REQUIRE(array[3] == expected_array[3]);
223
+ REQUIRE(array[4] == expected_array[4]);
224
+ REQUIRE(array[5] == expected_array[5]);
225
+ }
226
+
227
+ } /* namespace datasketches */
228
+
229
+ #endif
@@ -0,0 +1,17 @@
1
+ [build-system]
2
+ requires = ["wheel",
3
+ "setuptools >= 30.3.0",
4
+ "setuptools_scm",
5
+ "cmake >= 3.12"]
6
+
7
+ [tool.tox]
8
+ legacy_tox_ini = """
9
+ [tox]
10
+ envlist = py3
11
+
12
+ [testenv]
13
+ deps = pytest
14
+ numpy
15
+ changedir = python/tests
16
+ commands = pytest
17
+ """
@@ -0,0 +1,61 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ # TODO: Can we force python version >= 3.0?
19
+ if (MSVC)
20
+ set(PYBIND11_CPP_STANDARD /std:c++11)
21
+ else()
22
+ set(PYBIND11_CPP_STANDARD -std=c++11)
23
+ endif()
24
+
25
+ add_subdirectory(pybind11)
26
+
27
+ pybind11_add_module(python MODULE EXCLUDE_FROM_ALL SYSTEM THIN_LTO)
28
+
29
+ target_link_libraries(python
30
+ PRIVATE
31
+ common
32
+ hll
33
+ kll
34
+ cpc
35
+ fi
36
+ theta
37
+ sampling
38
+ pybind11::module
39
+ )
40
+
41
+ set_target_properties(python PROPERTIES
42
+ PREFIX ""
43
+ OUTPUT_NAME datasketches
44
+ )
45
+
46
+ # ensure we make a .so on Mac rather than .dylib
47
+ if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
48
+ set_target_properties(python PROPERTIES SUFFIX ".so")
49
+ endif()
50
+
51
+ target_sources(python
52
+ PRIVATE
53
+ src/datasketches.cpp
54
+ src/hll_wrapper.cpp
55
+ src/kll_wrapper.cpp
56
+ src/cpc_wrapper.cpp
57
+ src/fi_wrapper.cpp
58
+ src/theta_wrapper.cpp
59
+ src/vo_wrapper.cpp
60
+ src/vector_of_kll.cpp
61
+ )
@@ -0,0 +1,78 @@
1
+ # Python Wrapper for Datasketches
2
+
3
+ ## Installation
4
+
5
+ The release files do not include the needed python binding library ([pybind11](https://github.com/pybind/pybind11)). If building
6
+ from a release package, you must ensure that the pybind11 directory points to a local copy of pybind11.
7
+
8
+ An official pypi build is eventually planned but not yet available.
9
+
10
+ If you instead want to take a (possibly ill-advised) gamble on the current state of the master branch being useable, you can run:
11
+ ```pip install git+https://github.com/apache/datasketches-cpp.git```
12
+
13
+ ## Developer Instructions
14
+
15
+ ### Building
16
+
17
+ When cloning the source repository, you should include the pybind11 submodule with the `--recursive` option to the clone command:
18
+ ```
19
+ git clone --recursive https://github.com/apache/datasketches-cpp.git
20
+ cd datasketches-cpp
21
+ python -m pip install --upgrade pip setuptools wheel numpy
22
+ python setup.py build
23
+ ```
24
+
25
+ If you cloned without `--recursive`, you can add the submodule post-checkout using `git submodule update --init --recursive`.
26
+
27
+ ### Installing
28
+
29
+ Assuming you have already checked out the library and any dependent submodules, install by simply replacing the last
30
+ line of the build command with `python setup.py install`.
31
+
32
+ ### Unit tests
33
+
34
+ The python tests are run with `tox`. To ensure you have all the needed packages, from the package base directory run:
35
+ ```
36
+ python -m pip install --upgrade pip setuptools wheel numpy tox
37
+ tox
38
+ ```
39
+
40
+ ## Usage
41
+
42
+ Having installed the library, loading the Datasketches library in Python is simple: `import datasketches`.
43
+
44
+ ## Available Sketch Classes
45
+
46
+ - KLL
47
+ - `kll_ints_sketch`
48
+ - `kll_floats_sketch`
49
+ - Frequent Items
50
+ - `frequent_strings_sketch`
51
+ - Error types are `frequent_items_error_type.{NO_FALSE_NEGATIVES | NO_FALSE_POSITIVES}`
52
+ - Theta
53
+ - `update_theta_sketch`
54
+ - `compact_theta_sketch` (cannot be instantiated directly)
55
+ - `theta_union`
56
+ - `theta_intersection`
57
+ - `theta_a_not_b`
58
+ - HLL
59
+ - `hll_sketch`
60
+ - `hll_union`
61
+ - Target HLL types are `tgt_hll_type.{HLL_4 | HLL_6 | HLL_8}`
62
+ - CPC
63
+ - `cpc_sketch`
64
+ - `cpc_union`
65
+ - VarOpt Sampling
66
+ - `var_opt_sketch`
67
+ - `var_opt_union`
68
+ - Vector of KLL
69
+ - `vector_of_kll_ints_sketches`
70
+ - `vector_of_kll_floats_sketches`
71
+
72
+ ## Known Differences from C++
73
+
74
+ The Python API largely mirrors the C++ API, with a few minor exceptions: The primary known differences are that Python on modern platforms does not support unsigned integer values or numeric values with fewer than 64 bits. As a result, you may not be able to produce identical sketches from within Python as you can with Java and C++. Loading those sketches after they have been serialized from another language will work as expected.
75
+
76
+ The Vector of KLL object is currently exclusive to python, and holds an array of independent KLL sketches. This is useful for creating a set of KLL sketches over a vector and has been designed to allow input as either a vector or a matrix of multiple vectors.
77
+
78
+ We have also removed reliance on a builder class for theta sketches as Python allows named arguments to the constructor, not strictly positional arguments.
@@ -0,0 +1,345 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "## CPC Sketch Examples"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "### Basic Sketch Usage"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 1,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "from datasketches import cpc_sketch, cpc_union"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "markdown",
28
+ "metadata": {},
29
+ "source": [
30
+ "We'll create a sketch with log2(k) = 12"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 2,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "sk = cpc_sketch(12)"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "markdown",
44
+ "metadata": {},
45
+ "source": [
46
+ "Insert ~2 million points. Values are hashed, so using sequential integers is fine for demonstration purposes."
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": 3,
52
+ "metadata": {},
53
+ "outputs": [
54
+ {
55
+ "name": "stdout",
56
+ "output_type": "stream",
57
+ "text": [
58
+ "### CPC sketch summary:\n",
59
+ " lgK : 12\n",
60
+ " seed hash : 93cc\n",
61
+ " C : 38212\n",
62
+ " flavor : 4\n",
63
+ " merged : false\n",
64
+ " compressed : false\n",
65
+ " intresting col : 5\n",
66
+ " HIP estimate : 2.09721e+06\n",
67
+ " kxp : 11.4725\n",
68
+ " offset : 6\n",
69
+ " table : allocated\n",
70
+ " num SV : 135\n",
71
+ " window : allocated\n",
72
+ "### End sketch summary\n",
73
+ "\n"
74
+ ]
75
+ }
76
+ ],
77
+ "source": [
78
+ "n = 1 << 21\n",
79
+ "for i in range(0, n):\n",
80
+ " sk.update(i)\n",
81
+ "print(sk)"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "markdown",
86
+ "metadata": {},
87
+ "source": [
88
+ "Since we know the exact value of n we can look at the estimate and upper/lower bounds as a % of the true value. We'll look at the bounds at 1 standard deviation. In this case, the true value does lie within the bounds, but since these are probabilistic bounds the true value will sometimes be outside them (especially at 1 standard deviation)."
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": 4,
94
+ "metadata": {},
95
+ "outputs": [
96
+ {
97
+ "name": "stdout",
98
+ "output_type": "stream",
99
+ "text": [
100
+ "Upper bound (1 std. dev) as % of true value: 100.9281\n"
101
+ ]
102
+ }
103
+ ],
104
+ "source": [
105
+ "print(\"Upper bound (1 std. dev) as % of true value: \", round(100*sk.get_upper_bound(1) / n, 4))"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": 5,
111
+ "metadata": {},
112
+ "outputs": [
113
+ {
114
+ "name": "stdout",
115
+ "output_type": "stream",
116
+ "text": [
117
+ "Estimate as % of true value: 100.0026\n"
118
+ ]
119
+ }
120
+ ],
121
+ "source": [
122
+ "print(\"Estimate as % of true value: \", round(100*sk.get_estimate() / n, 4))"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": 6,
128
+ "metadata": {},
129
+ "outputs": [
130
+ {
131
+ "name": "stdout",
132
+ "output_type": "stream",
133
+ "text": [
134
+ "Lower bound (1 std. dev) as % of true value: 99.0935\n"
135
+ ]
136
+ }
137
+ ],
138
+ "source": [
139
+ "print(\"Lower bound (1 std. dev) as % of true value: \", round(100*sk.get_lower_bound(1) / n, 4))"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "markdown",
144
+ "metadata": {},
145
+ "source": [
146
+ "Finally, we can serialize and deserialize the sketch, which will give us back the same structure."
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "code",
151
+ "execution_count": 7,
152
+ "metadata": {},
153
+ "outputs": [
154
+ {
155
+ "data": {
156
+ "text/plain": [
157
+ "2484"
158
+ ]
159
+ },
160
+ "execution_count": 7,
161
+ "metadata": {},
162
+ "output_type": "execute_result"
163
+ }
164
+ ],
165
+ "source": [
166
+ "sk_bytes = sk.serialize()\n",
167
+ "len(sk_bytes)"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": 8,
173
+ "metadata": {},
174
+ "outputs": [
175
+ {
176
+ "name": "stdout",
177
+ "output_type": "stream",
178
+ "text": [
179
+ "### CPC sketch summary:\n",
180
+ " lgK : 12\n",
181
+ " seed hash : 93cc\n",
182
+ " C : 38212\n",
183
+ " flavor : 4\n",
184
+ " merged : false\n",
185
+ " compressed : false\n",
186
+ " intresting col : 5\n",
187
+ " HIP estimate : 2.09721e+06\n",
188
+ " kxp : 11.4725\n",
189
+ " offset : 6\n",
190
+ " table : allocated\n",
191
+ " num SV : 135\n",
192
+ " window : allocated\n",
193
+ "### End sketch summary\n",
194
+ "\n"
195
+ ]
196
+ }
197
+ ],
198
+ "source": [
199
+ "sk2 = cpc_sketch.deserialize(sk_bytes)\n",
200
+ "print(sk2)"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "markdown",
205
+ "metadata": {},
206
+ "source": [
207
+ "### Sketch Union Usage"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "markdown",
212
+ "metadata": {},
213
+ "source": [
214
+ "Here, we'll create two sketches with partial overlap in values. For good measure, we'll let k be larger in one sketch. For most applications we'd generally create all new data using the same size sketch, allowing differences to creep in when combining new and historical data."
215
+ ]
216
+ },
217
+ {
218
+ "cell_type": "code",
219
+ "execution_count": 9,
220
+ "metadata": {},
221
+ "outputs": [],
222
+ "source": [
223
+ "k = 12\n",
224
+ "n = 1 << 20\n",
225
+ "offset = int(3 * n / 4)"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": 10,
231
+ "metadata": {},
232
+ "outputs": [],
233
+ "source": [
234
+ "sk1 = cpc_sketch(k)\n",
235
+ "sk2 = cpc_sketch(k + 1)\n",
236
+ "for i in range(0, n):\n",
237
+ " sk1.update(i)\n",
238
+ " sk2.update(i + offset)"
239
+ ]
240
+ },
241
+ {
242
+ "cell_type": "markdown",
243
+ "metadata": {},
244
+ "source": [
245
+ "Create a union object and add the sketches to that. To demonstrate smoothly handling multiple sketch sizes, we'll use a size of k+1 here."
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": 11,
251
+ "metadata": {},
252
+ "outputs": [],
253
+ "source": [
254
+ "union = cpc_union(k+1)\n",
255
+ "union.update(sk1)\n",
256
+ "union.update(sk2)"
257
+ ]
258
+ },
259
+ {
260
+ "cell_type": "markdown",
261
+ "metadata": {},
262
+ "source": [
263
+ "Note how log config k has automatically adopted the value of the smaller input sketch."
264
+ ]
265
+ },
266
+ {
267
+ "cell_type": "code",
268
+ "execution_count": 12,
269
+ "metadata": {},
270
+ "outputs": [
271
+ {
272
+ "name": "stdout",
273
+ "output_type": "stream",
274
+ "text": [
275
+ "### CPC sketch summary:\n",
276
+ " lgK : 12\n",
277
+ " seed hash : 93cc\n",
278
+ " C : 37418\n",
279
+ " flavor : 4\n",
280
+ " merged : true\n",
281
+ " compressed : false\n",
282
+ " intresting col : 5\n",
283
+ " HIP estimate : 0\n",
284
+ " kxp : 4096\n",
285
+ " offset : 6\n",
286
+ " table : allocated\n",
287
+ " num SV : 123\n",
288
+ " window : allocated\n",
289
+ "### End sketch summary\n",
290
+ "\n"
291
+ ]
292
+ }
293
+ ],
294
+ "source": [
295
+ "result = union.get_result()\n",
296
+ "print(result)"
297
+ ]
298
+ },
299
+ {
300
+ "cell_type": "markdown",
301
+ "metadata": {},
302
+ "source": [
303
+ "We can again compare against the exact result, in this case 1.75*n"
304
+ ]
305
+ },
306
+ {
307
+ "cell_type": "code",
308
+ "execution_count": 13,
309
+ "metadata": {},
310
+ "outputs": [
311
+ {
312
+ "name": "stdout",
313
+ "output_type": "stream",
314
+ "text": [
315
+ "Estimate as % of true value: 99.6646\n"
316
+ ]
317
+ }
318
+ ],
319
+ "source": [
320
+ "print(\"Estimate as % of true value: \", round(100*result.get_estimate() / (7*n/4), 4))"
321
+ ]
322
+ }
323
+ ],
324
+ "metadata": {
325
+ "kernelspec": {
326
+ "display_name": "Python 3",
327
+ "language": "python",
328
+ "name": "python3"
329
+ },
330
+ "language_info": {
331
+ "codemirror_mode": {
332
+ "name": "ipython",
333
+ "version": 3
334
+ },
335
+ "file_extension": ".py",
336
+ "mimetype": "text/x-python",
337
+ "name": "python",
338
+ "nbconvert_exporter": "python",
339
+ "pygments_lexer": "ipython3",
340
+ "version": "3.7.0"
341
+ }
342
+ },
343
+ "nbformat": 4,
344
+ "nbformat_minor": 2
345
+ }