datasketches 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/NOTICE +1 -1
  4. data/README.md +0 -2
  5. data/ext/datasketches/cpc_wrapper.cpp +2 -2
  6. data/ext/datasketches/kll_wrapper.cpp +0 -10
  7. data/lib/datasketches/version.rb +1 -1
  8. data/lib/datasketches.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  10. data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
  11. data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
  12. data/vendor/datasketches-cpp/Doxyfile +2827 -0
  13. data/vendor/datasketches-cpp/LICENSE +0 -76
  14. data/vendor/datasketches-cpp/NOTICE +1 -1
  15. data/vendor/datasketches-cpp/README.md +1 -3
  16. data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
  18. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
  19. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
  20. data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
  21. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
  22. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
  23. data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
  24. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  25. data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
  26. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
  27. data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
  28. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
  29. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
  30. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
  31. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
  32. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
  33. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
  34. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
  35. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
  36. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
  37. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
  38. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
  39. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
  40. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
  41. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
  42. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
  43. data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
  44. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
  45. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
  46. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
  47. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
  48. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
  49. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
  50. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
  51. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
  52. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
  53. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
  54. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
  55. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
  56. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
  57. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
  58. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
  59. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
  60. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
  62. data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
  63. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
  64. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
  65. data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
  66. data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
  67. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
  70. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
  71. data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
  74. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
  75. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
  76. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
  77. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
  78. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
  79. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
  80. data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
  84. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
  85. data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
  86. data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
  87. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
  88. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
  89. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
  90. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
  91. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
  92. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
  94. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
  95. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
  96. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
  97. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
  98. data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
  99. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
  100. data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
  101. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
  102. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
  103. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
  104. data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
  105. data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
  106. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
  107. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
  108. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  109. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
  110. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
  111. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
  112. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
  114. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
  115. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
  116. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
  117. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
  119. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
  120. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
  121. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
  122. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
  124. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
  125. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
  126. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
  127. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
  128. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
  129. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
  130. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
  131. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
  132. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
  133. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
  134. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
  135. data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
  136. data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
  137. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
  138. data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
  139. data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
  140. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
  141. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
  142. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
  143. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
  144. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
  145. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
  146. data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
  147. data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
  148. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
  149. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
  150. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
  151. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
  152. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
  153. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
  154. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  155. metadata +47 -93
  156. data/vendor/datasketches-cpp/MANIFEST.in +0 -39
  157. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  160. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  161. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  162. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  163. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  164. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  165. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  166. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  167. data/vendor/datasketches-cpp/pyproject.toml +0 -23
  168. data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
  169. data/vendor/datasketches-cpp/python/README.md +0 -85
  170. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
  171. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
  172. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
  173. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
  174. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
  175. data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
  176. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
  177. data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
  178. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
  179. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
  180. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
  181. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
  182. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
  183. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
  184. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
  185. data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
  186. data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
  187. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
  188. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
  189. data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
  190. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
  191. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
  192. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
  193. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
  194. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
  195. data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
  196. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
  197. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
  198. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
  199. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
  200. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
  201. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
  202. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
  203. data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
  204. data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
  205. data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
  206. data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
  207. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
  208. data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
  209. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
  210. data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
  211. data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
  212. data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
  213. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
  214. data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
  215. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  216. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  217. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  218. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  219. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  220. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
  221. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  222. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  223. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  224. data/vendor/datasketches-cpp/setup.py +0 -110
  225. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  226. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  227. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  228. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tox.ini +0 -26
  230. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
  231. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
  232. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
  233. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
  234. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -1,104 +0,0 @@
1
- /*
2
- * Licensed to the Apache Software Foundation (ASF) under one
3
- * or more contributor license agreements. See the NOTICE file
4
- * distributed with this work for additional information
5
- * regarding copyright ownership. The ASF licenses this file
6
- * to you under the Apache License, Version 2.0 (the
7
- * "License"); you may not use this file except in compliance
8
- * with the License. You may obtain a copy of the License at
9
- *
10
- * http://www.apache.org/licenses/LICENSE-2.0
11
- *
12
- * Unless required by applicable law or agreed to in writing,
13
- * software distributed under the License is distributed on an
14
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
- * KIND, either express or implied. See the License for the
16
- * specific language governing permissions and limitations
17
- * under the License.
18
- */
19
-
20
- #ifndef _QUANTILE_CONDITIONAL_HPP_
21
- #define _QUANTILE_CONDITIONAL_HPP_
22
-
23
- /*
24
- This header defines conditionally compiled functions shared
25
- across the set of quantile family sketches.
26
- */
27
-
28
- #include "common_defs.hpp"
29
- #include "py_serde.hpp"
30
-
31
- #include <pybind11/pybind11.h>
32
- #include <pybind11/numpy.h>
33
-
34
- namespace py = pybind11;
35
-
36
- // Serialization
37
- // std::string and arithmetic types, where we don't need a separate serde
38
- template<typename T, typename SK, typename std::enable_if<std::is_arithmetic<T>::value || std::is_same<std::string, T>::value, bool>::type = 0>
39
- void add_serialization(py::class_<SK>& clazz) {
40
- clazz.def(
41
- "serialize",
42
- [](const SK& sk) {
43
- auto bytes = sk.serialize();
44
- return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
45
- },
46
- "Serializes the sketch into a bytes object."
47
- )
48
- .def_static(
49
- "deserialize",
50
- [](const std::string& bytes) { return SK::deserialize(bytes.data(), bytes.size()); },
51
- py::arg("bytes"),
52
- "Deserializes the sketch from a bytes object."
53
- );
54
- }
55
-
56
- // py::object and other types where the caller must provide a serde
57
- template<typename T, typename SK, typename std::enable_if<!std::is_arithmetic<T>::value && !std::is_same<std::string, T>::value, bool>::type = 0>
58
- void add_serialization(py::class_<SK>& clazz) {
59
- clazz.def(
60
- "serialize",
61
- [](const SK& sk, datasketches::py_object_serde& serde) {
62
- auto bytes = sk.serialize(0, serde);
63
- return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
64
- }, py::arg("serde"),
65
- "Serializes the sketch into a bytes object using the provided serde."
66
- )
67
- .def_static(
68
- "deserialize",
69
- [](const std::string& bytes, datasketches::py_object_serde& serde) {
70
- return SK::deserialize(bytes.data(), bytes.size(), serde);
71
- }, py::arg("bytes"), py::arg("serde"),
72
- "Deserializes the sketch from a bytes object using the provided serde."
73
- );
74
- }
75
-
76
- // Vector Updates
77
- // * Only allowed for POD types based on numpy restriction, which
78
- // is equivalent to both std::is_trivial and std::is_standard_layout.
79
- // * Nothing is added to types that are not PODs.
80
- // POD type
81
- template<typename T, typename SK, typename std::enable_if<std::is_trivial<T>::value && std::is_standard_layout<T>::value, bool>::type = 0>
82
- void add_vector_update(py::class_<SK>& clazz) {
83
- clazz.def(
84
- "update",
85
- [](SK& sk, py::array_t<T, py::array::c_style | py::array::forcecast> items) {
86
- if (items.ndim() != 1) {
87
- throw std::invalid_argument("input data must have only one dimension. Found: "
88
- + std::to_string(items.ndim()));
89
- }
90
- auto array = items.template unchecked<1>();
91
- for (uint32_t i = 0; i < array.size(); ++i) sk.update(array(i));
92
- },
93
- py::arg("array"),
94
- "Updates the sketch with the values in the given array"
95
- );
96
- }
97
-
98
- // non-POD type
99
- template<typename T, typename SK, typename std::enable_if<!std::is_trivial<T>::value || !std::is_standard_layout<T>::value, bool>::type = 0>
100
- void add_vector_update(py::class_<SK>& clazz) {
101
- unused(clazz);
102
- }
103
-
104
- #endif // _QUANTILE_CONDITIONAL_HPP_
@@ -1,136 +0,0 @@
1
- /*
2
- * Licensed to the Apache Software Foundation (ASF) under one
3
- * or more contributor license agreements. See the NOTICE file
4
- * distributed with this work for additional information
5
- * regarding copyright ownership. The ASF licenses this file
6
- * to you under the Apache License, Version 2.0 (the
7
- * "License"); you may not use this file except in compliance
8
- * with the License. You may obtain a copy of the License at
9
- *
10
- * http://www.apache.org/licenses/LICENSE-2.0
11
- *
12
- * Unless required by applicable law or agreed to in writing,
13
- * software distributed under the License is distributed on an
14
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
- * KIND, either express or implied. See the License for the
16
- * specific language governing permissions and limitations
17
- * under the License.
18
- */
19
-
20
- #include <memory>
21
- #include <pybind11/pybind11.h>
22
-
23
- #ifndef _TUPLE_POLICY_HPP_
24
- #define _TUPLE_POLICY_HPP_
25
-
26
- namespace py = pybind11;
27
-
28
- namespace datasketches {
29
-
30
- /**
31
- * @brief tuple_policy provides the underlying base class from
32
- * which native Python policies ultimately inherit. The actual
33
- * policies implement TuplePolicy, as shown in TuplePolicy.py
34
- */
35
- struct tuple_policy {
36
- virtual py::object create_summary() const = 0;
37
- virtual py::object update_summary(py::object& summary, const py::object& update) const = 0;
38
- virtual py::object operator()(py::object& summary, const py::object& update) const = 0;
39
- virtual ~tuple_policy() = default;
40
- };
41
-
42
- /**
43
- * @brief TuplePolicy provides the "trampoline" class for pybind11
44
- * that allows for a native Python implementation of tuple
45
- * sketch policies.
46
- */
47
- struct TuplePolicy : public tuple_policy {
48
- using tuple_policy::tuple_policy;
49
-
50
- /**
51
- * @brief Create a summary object
52
- *
53
- * @return py::object representing a new summary
54
- */
55
- py::object create_summary() const override {
56
- PYBIND11_OVERRIDE_PURE(
57
- py::object, // Return type
58
- tuple_policy, // Parent class
59
- create_summary, // Name of function in C++ (must match Python name)
60
- // Argument(s) -- if any
61
- );
62
- }
63
-
64
- /**
65
- * @brief Update a summary object using this policy
66
- *
67
- * @param summary The current summary to update
68
- * @param update The new value with which to update the summary
69
- * @return py::object The updated summary
70
- */
71
- py::object update_summary(py::object& summary, const py::object& update) const override {
72
- PYBIND11_OVERRIDE_PURE(
73
- py::object, // Return type
74
- tuple_policy, // Parent class
75
- update_summary, // Name of function in C++ (must match Python name)
76
- summary, update // Arguments
77
- );
78
- }
79
-
80
- /**
81
- * @brief Applies this policy to summary with the provided update
82
- *
83
- * @param summary The current summary on which to apply the policy
84
- * @param update An update to apply to the current summary
85
- * @return py::object The potentially modified summary
86
- */
87
- py::object operator()(py::object& summary, const py::object& update) const override {
88
- PYBIND11_OVERRIDE_PURE_NAME(
89
- py::object, // Return type
90
- tuple_policy, // Parent class
91
- "__call__", // Name of function in python
92
- operator(), // Name of function in C++
93
- summary, update // Arguemnts
94
- );
95
- }
96
- };
97
-
98
- /* The tuple_policy_holder provides a concrete class that dispatches calls
99
- * from the sketch to the tuple_policy. This class is needed to provide a
100
- * concrete object to produce a compiled library, but library users should
101
- * never need to use this directly.
102
- */
103
- struct tuple_policy_holder {
104
- explicit tuple_policy_holder(std::shared_ptr<tuple_policy> policy) : _policy(policy) {}
105
- tuple_policy_holder(const tuple_policy_holder& other) : _policy(other._policy) {}
106
- tuple_policy_holder(tuple_policy_holder&& other) : _policy(std::move(other._policy)) {}
107
- tuple_policy_holder& operator=(const tuple_policy_holder& other) { _policy = other._policy; return *this; }
108
- tuple_policy_holder& operator=(tuple_policy_holder&& other) { std::swap(_policy, other._policy); return *this; }
109
-
110
- py::object create() const { return _policy->create_summary(); }
111
-
112
- void update(py::object& summary, const py::object& update) const {
113
- summary = _policy->update_summary(summary, update);
114
- }
115
-
116
- void operator()(py::object& summary, const py::object& update) const {
117
- summary = _policy->operator()(summary, update);
118
- }
119
-
120
- private:
121
- std::shared_ptr<tuple_policy> _policy;
122
- };
123
-
124
- /* A degenerate policy used to enable Jaccard Similarity on tuple sketches,
125
- * where the computation requires a union and intersection over the keys but
126
- * does not need to observe the summaries.
127
- */
128
- struct dummy_jaccard_policy {
129
- void operator()(py::object&, const py::object&) const {
130
- return;
131
- }
132
- };
133
-
134
- }
135
-
136
- #endif // _TUPLE_POLICY_HPP_
@@ -1,345 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "metadata": {},
6
- "source": [
7
- "## CPC Sketch Examples"
8
- ]
9
- },
10
- {
11
- "cell_type": "markdown",
12
- "metadata": {},
13
- "source": [
14
- "### Basic Sketch Usage"
15
- ]
16
- },
17
- {
18
- "cell_type": "code",
19
- "execution_count": 1,
20
- "metadata": {},
21
- "outputs": [],
22
- "source": [
23
- "from datasketches import cpc_sketch, cpc_union"
24
- ]
25
- },
26
- {
27
- "cell_type": "markdown",
28
- "metadata": {},
29
- "source": [
30
- "We'll create a sketch with log2(k) = 12"
31
- ]
32
- },
33
- {
34
- "cell_type": "code",
35
- "execution_count": 2,
36
- "metadata": {},
37
- "outputs": [],
38
- "source": [
39
- "sk = cpc_sketch(12)"
40
- ]
41
- },
42
- {
43
- "cell_type": "markdown",
44
- "metadata": {},
45
- "source": [
46
- "Insert ~2 million points. Values are hashed, so using sequential integers is fine for demonstration purposes."
47
- ]
48
- },
49
- {
50
- "cell_type": "code",
51
- "execution_count": 3,
52
- "metadata": {},
53
- "outputs": [
54
- {
55
- "name": "stdout",
56
- "output_type": "stream",
57
- "text": [
58
- "### CPC sketch summary:\n",
59
- " lgK : 12\n",
60
- " seed hash : 93cc\n",
61
- " C : 38212\n",
62
- " flavor : 4\n",
63
- " merged : false\n",
64
- " compressed : false\n",
65
- " intresting col : 5\n",
66
- " HIP estimate : 2.09721e+06\n",
67
- " kxp : 11.4725\n",
68
- " offset : 6\n",
69
- " table : allocated\n",
70
- " num SV : 135\n",
71
- " window : allocated\n",
72
- "### End sketch summary\n",
73
- "\n"
74
- ]
75
- }
76
- ],
77
- "source": [
78
- "n = 1 << 21\n",
79
- "for i in range(0, n):\n",
80
- " sk.update(i)\n",
81
- "print(sk)"
82
- ]
83
- },
84
- {
85
- "cell_type": "markdown",
86
- "metadata": {},
87
- "source": [
88
- "Since we know the exact value of n we can look at the estimate and upper/lower bounds as a % of the true value. We'll look at the bounds at 1 standard deviation. In this case, the true value does lie within the bounds, but since these are probabilistic bounds the true value will sometimes be outside them (especially at 1 standard deviation)."
89
- ]
90
- },
91
- {
92
- "cell_type": "code",
93
- "execution_count": 4,
94
- "metadata": {},
95
- "outputs": [
96
- {
97
- "name": "stdout",
98
- "output_type": "stream",
99
- "text": [
100
- "Upper bound (1 std. dev) as % of true value: 100.9281\n"
101
- ]
102
- }
103
- ],
104
- "source": [
105
- "print(\"Upper bound (1 std. dev) as % of true value: \", round(100*sk.get_upper_bound(1) / n, 4))"
106
- ]
107
- },
108
- {
109
- "cell_type": "code",
110
- "execution_count": 5,
111
- "metadata": {},
112
- "outputs": [
113
- {
114
- "name": "stdout",
115
- "output_type": "stream",
116
- "text": [
117
- "Estimate as % of true value: 100.0026\n"
118
- ]
119
- }
120
- ],
121
- "source": [
122
- "print(\"Estimate as % of true value: \", round(100*sk.get_estimate() / n, 4))"
123
- ]
124
- },
125
- {
126
- "cell_type": "code",
127
- "execution_count": 6,
128
- "metadata": {},
129
- "outputs": [
130
- {
131
- "name": "stdout",
132
- "output_type": "stream",
133
- "text": [
134
- "Lower bound (1 std. dev) as % of true value: 99.0935\n"
135
- ]
136
- }
137
- ],
138
- "source": [
139
- "print(\"Lower bound (1 std. dev) as % of true value: \", round(100*sk.get_lower_bound(1) / n, 4))"
140
- ]
141
- },
142
- {
143
- "cell_type": "markdown",
144
- "metadata": {},
145
- "source": [
146
- "Finally, we can serialize and deserialize the sketch, which will give us back the same structure."
147
- ]
148
- },
149
- {
150
- "cell_type": "code",
151
- "execution_count": 7,
152
- "metadata": {},
153
- "outputs": [
154
- {
155
- "data": {
156
- "text/plain": [
157
- "2484"
158
- ]
159
- },
160
- "execution_count": 7,
161
- "metadata": {},
162
- "output_type": "execute_result"
163
- }
164
- ],
165
- "source": [
166
- "sk_bytes = sk.serialize()\n",
167
- "len(sk_bytes)"
168
- ]
169
- },
170
- {
171
- "cell_type": "code",
172
- "execution_count": 8,
173
- "metadata": {},
174
- "outputs": [
175
- {
176
- "name": "stdout",
177
- "output_type": "stream",
178
- "text": [
179
- "### CPC sketch summary:\n",
180
- " lgK : 12\n",
181
- " seed hash : 93cc\n",
182
- " C : 38212\n",
183
- " flavor : 4\n",
184
- " merged : false\n",
185
- " compressed : false\n",
186
- " intresting col : 5\n",
187
- " HIP estimate : 2.09721e+06\n",
188
- " kxp : 11.4725\n",
189
- " offset : 6\n",
190
- " table : allocated\n",
191
- " num SV : 135\n",
192
- " window : allocated\n",
193
- "### End sketch summary\n",
194
- "\n"
195
- ]
196
- }
197
- ],
198
- "source": [
199
- "sk2 = cpc_sketch.deserialize(sk_bytes)\n",
200
- "print(sk2)"
201
- ]
202
- },
203
- {
204
- "cell_type": "markdown",
205
- "metadata": {},
206
- "source": [
207
- "### Sketch Union Usage"
208
- ]
209
- },
210
- {
211
- "cell_type": "markdown",
212
- "metadata": {},
213
- "source": [
214
- "Here, we'll create two sketches with partial overlap in values. For good measure, we'll let k be larger in one sketch. For most applications we'd generally create all new data using the same size sketch, allowing differences to creep in when combining new and historical data."
215
- ]
216
- },
217
- {
218
- "cell_type": "code",
219
- "execution_count": 9,
220
- "metadata": {},
221
- "outputs": [],
222
- "source": [
223
- "k = 12\n",
224
- "n = 1 << 20\n",
225
- "offset = int(3 * n / 4)"
226
- ]
227
- },
228
- {
229
- "cell_type": "code",
230
- "execution_count": 10,
231
- "metadata": {},
232
- "outputs": [],
233
- "source": [
234
- "sk1 = cpc_sketch(k)\n",
235
- "sk2 = cpc_sketch(k + 1)\n",
236
- "for i in range(0, n):\n",
237
- " sk1.update(i)\n",
238
- " sk2.update(i + offset)"
239
- ]
240
- },
241
- {
242
- "cell_type": "markdown",
243
- "metadata": {},
244
- "source": [
245
- "Create a union object and add the sketches to that. To demonstrate smoothly handling multiple sketch sizes, we'll use a size of k+1 here."
246
- ]
247
- },
248
- {
249
- "cell_type": "code",
250
- "execution_count": 11,
251
- "metadata": {},
252
- "outputs": [],
253
- "source": [
254
- "union = cpc_union(k+1)\n",
255
- "union.update(sk1)\n",
256
- "union.update(sk2)"
257
- ]
258
- },
259
- {
260
- "cell_type": "markdown",
261
- "metadata": {},
262
- "source": [
263
- "Note how log config k has automatically adopted the value of the smaller input sketch."
264
- ]
265
- },
266
- {
267
- "cell_type": "code",
268
- "execution_count": 12,
269
- "metadata": {},
270
- "outputs": [
271
- {
272
- "name": "stdout",
273
- "output_type": "stream",
274
- "text": [
275
- "### CPC sketch summary:\n",
276
- " lgK : 12\n",
277
- " seed hash : 93cc\n",
278
- " C : 37418\n",
279
- " flavor : 4\n",
280
- " merged : true\n",
281
- " compressed : false\n",
282
- " intresting col : 5\n",
283
- " HIP estimate : 0\n",
284
- " kxp : 4096\n",
285
- " offset : 6\n",
286
- " table : allocated\n",
287
- " num SV : 123\n",
288
- " window : allocated\n",
289
- "### End sketch summary\n",
290
- "\n"
291
- ]
292
- }
293
- ],
294
- "source": [
295
- "result = union.get_result()\n",
296
- "print(result)"
297
- ]
298
- },
299
- {
300
- "cell_type": "markdown",
301
- "metadata": {},
302
- "source": [
303
- "We can again compare against the exact result, in this case 1.75*n"
304
- ]
305
- },
306
- {
307
- "cell_type": "code",
308
- "execution_count": 13,
309
- "metadata": {},
310
- "outputs": [
311
- {
312
- "name": "stdout",
313
- "output_type": "stream",
314
- "text": [
315
- "Estimate as % of true value: 99.6646\n"
316
- ]
317
- }
318
- ],
319
- "source": [
320
- "print(\"Estimate as % of true value: \", round(100*result.get_estimate() / (7*n/4), 4))"
321
- ]
322
- }
323
- ],
324
- "metadata": {
325
- "kernelspec": {
326
- "display_name": "Python 3",
327
- "language": "python",
328
- "name": "python3"
329
- },
330
- "language_info": {
331
- "codemirror_mode": {
332
- "name": "ipython",
333
- "version": 3
334
- },
335
- "file_extension": ".py",
336
- "mimetype": "text/x-python",
337
- "name": "python",
338
- "nbconvert_exporter": "python",
339
- "pygments_lexer": "ipython3",
340
- "version": "3.7.0"
341
- }
342
- },
343
- "nbformat": 4,
344
- "nbformat_minor": 2
345
- }