datasketches 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/NOTICE +1 -1
  4. data/README.md +0 -2
  5. data/ext/datasketches/cpc_wrapper.cpp +2 -2
  6. data/ext/datasketches/kll_wrapper.cpp +0 -10
  7. data/lib/datasketches/version.rb +1 -1
  8. data/lib/datasketches.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  10. data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
  11. data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
  12. data/vendor/datasketches-cpp/Doxyfile +2827 -0
  13. data/vendor/datasketches-cpp/LICENSE +0 -76
  14. data/vendor/datasketches-cpp/NOTICE +1 -1
  15. data/vendor/datasketches-cpp/README.md +1 -3
  16. data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
  18. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
  19. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
  20. data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
  21. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
  22. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
  23. data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
  24. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  25. data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
  26. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
  27. data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
  28. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
  29. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
  30. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
  31. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
  32. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
  33. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
  34. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
  35. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
  36. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
  37. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
  38. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
  39. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
  40. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
  41. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
  42. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
  43. data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
  44. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
  45. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
  46. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
  47. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
  48. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
  49. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
  50. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
  51. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
  52. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
  53. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
  54. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
  55. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
  56. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
  57. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
  58. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
  59. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
  60. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
  62. data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
  63. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
  64. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
  65. data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
  66. data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
  67. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
  70. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
  71. data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
  74. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
  75. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
  76. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
  77. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
  78. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
  79. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
  80. data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
  84. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
  85. data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
  86. data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
  87. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
  88. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
  89. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
  90. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
  91. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
  92. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
  94. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
  95. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
  96. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
  97. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
  98. data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
  99. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
  100. data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
  101. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
  102. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
  103. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
  104. data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
  105. data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
  106. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
  107. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
  108. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  109. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
  110. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
  111. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
  112. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
  114. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
  115. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
  116. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
  117. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
  119. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
  120. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
  121. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
  122. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
  124. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
  125. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
  126. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
  127. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
  128. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
  129. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
  130. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
  131. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
  132. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
  133. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
  134. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
  135. data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
  136. data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
  137. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
  138. data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
  139. data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
  140. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
  141. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
  142. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
  143. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
  144. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
  145. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
  146. data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
  147. data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
  148. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
  149. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
  150. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
  151. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
  152. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
  153. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
  154. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  155. metadata +47 -93
  156. data/vendor/datasketches-cpp/MANIFEST.in +0 -39
  157. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  160. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  161. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  162. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  163. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  164. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  165. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  166. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  167. data/vendor/datasketches-cpp/pyproject.toml +0 -23
  168. data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
  169. data/vendor/datasketches-cpp/python/README.md +0 -85
  170. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
  171. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
  172. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
  173. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
  174. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
  175. data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
  176. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
  177. data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
  178. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
  179. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
  180. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
  181. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
  182. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
  183. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
  184. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
  185. data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
  186. data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
  187. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
  188. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
  189. data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
  190. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
  191. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
  192. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
  193. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
  194. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
  195. data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
  196. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
  197. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
  198. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
  199. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
  200. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
  201. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
  202. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
  203. data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
  204. data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
  205. data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
  206. data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
  207. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
  208. data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
  209. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
  210. data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
  211. data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
  212. data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
  213. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
  214. data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
  215. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  216. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  217. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  218. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  219. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  220. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
  221. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  222. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  223. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  224. data/vendor/datasketches-cpp/setup.py +0 -110
  225. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  226. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  227. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  228. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tox.ini +0 -26
  230. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
  231. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
  232. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
  233. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
  234. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -1,95 +0,0 @@
1
- /*
2
- * Licensed to the Apache Software Foundation (ASF) under one
3
- * or more contributor license agreements. See the NOTICE file
4
- * distributed with this work for additional information
5
- * regarding copyright ownership. The ASF licenses this file
6
- * to you under the Apache License, Version 2.0 (the
7
- * "License"); you may not use this file except in compliance
8
- * with the License. You may obtain a copy of the License at
9
- *
10
- * http://www.apache.org/licenses/LICENSE-2.0
11
- *
12
- * Unless required by applicable law or agreed to in writing,
13
- * software distributed under the License is distributed on an
14
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
- * KIND, either express or implied. See the License for the
16
- * specific language governing permissions and limitations
17
- * under the License.
18
- */
19
-
20
- #include <pybind11/pybind11.h>
21
- #include <pybind11/stl.h>
22
- #include <pybind11/numpy.h>
23
- #include <vector>
24
-
25
- #include "kernel_function.hpp"
26
- #include "density_sketch.hpp"
27
-
28
- namespace py = pybind11;
29
-
30
- template<typename T, typename K>
31
- void bind_density_sketch(py::module &m, const char* name) {
32
- using namespace datasketches;
33
-
34
- py::class_<density_sketch<T, K>>(m, name)
35
- .def(
36
- py::init([](uint16_t k, uint32_t dim, std::shared_ptr<kernel_function> kernel) {
37
- kernel_function_holder holder(kernel);
38
- return density_sketch<T, K>(k, dim, holder);
39
- }),
40
- py::arg("k"), py::arg("dim"), py::arg("kernel"))
41
- .def("update", static_cast<void (density_sketch<T, K>::*)(const std::vector<T>&)>(&density_sketch<T, K>::update),
42
- "Updates the sketch with the given vector")
43
- .def("merge", static_cast<void (density_sketch<T, K>::*)(const density_sketch<T, K>&)>(&density_sketch<T, K>::merge), py::arg("sketch"),
44
- "Merges the provided sketch into this one")
45
- .def("is_empty", &density_sketch<T, K>::is_empty,
46
- "Returns True if the sketch is empty, otherwise False")
47
- .def("get_k", &density_sketch<T, K>::get_k,
48
- "Returns the configured parameter k")
49
- .def("get_dim", &density_sketch<T, K>::get_dim,
50
- "Returns the configured parameter dim")
51
- .def("get_n", &density_sketch<T, K>::get_n,
52
- "Returns the length of the input stream")
53
- .def("get_num_retained", &density_sketch<T, K>::get_num_retained,
54
- "Returns the number of retained items (samples) in the sketch")
55
- .def("is_estimation_mode", &density_sketch<T, K>::is_estimation_mode,
56
- "Returns True if the sketch is in estimation mode, otherwise False")
57
- .def("get_estimate", &density_sketch<T, K>::get_estimate, py::arg("point"),
58
- "Returns an approximate density at the given point")
59
- .def("__str__", &density_sketch<T, K>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
60
- "Produces a string summary of the sketch")
61
- .def("to_string", &density_sketch<T, K>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
62
- "Produces a string summary of the sketch")
63
- .def("__iter__", [](const density_sketch<T, K>& s){ return py::make_iterator(s.begin(), s.end()); })
64
- .def("serialize",
65
- [](const density_sketch<T, K>& sk) {
66
- auto bytes = sk.serialize();
67
- return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
68
- },
69
- "Serializes the sketch into a bytes object"
70
- )
71
- .def_static(
72
- "deserialize",
73
- [](const std::string& bytes, std::shared_ptr<kernel_function> kernel) {
74
- kernel_function_holder holder(kernel);
75
- return density_sketch<T, K>::deserialize(bytes.data(), bytes.size(), holder);
76
- },
77
- py::arg("bytes"), py::arg("kernel"),
78
- "Reads a bytes object and returns the corresponding density_sketch"
79
- );;
80
- }
81
-
82
- void init_density(py::module &m) {
83
- using namespace datasketches;
84
-
85
- // generic kernel function
86
- py::class_<kernel_function, KernelFunction, std::shared_ptr<kernel_function>>(m, "KernelFunction")
87
- .def(py::init())
88
- .def("__call__", &kernel_function::operator(), py::arg("a"), py::arg("b"))
89
- ;
90
-
91
- // the old sketch names can almost be defined, but the kernel_function_holder won't work in init()
92
- //bind_density_sketch<float, gaussian_kernel<float>>(m, "density_floats_sketch");
93
- //bind_density_sketch<double, gaussian_kernel<double>>(m, "density_doubles_sketch");
94
- bind_density_sketch<double, kernel_function_holder>(m, "_density_sketch");
95
- }
@@ -1,182 +0,0 @@
1
- /*
2
- * Licensed to the Apache Software Foundation (ASF) under one
3
- * or more contributor license agreements. See the NOTICE file
4
- * distributed with this work for additional information
5
- * regarding copyright ownership. The ASF licenses this file
6
- * to you under the Apache License, Version 2.0 (the
7
- * "License"); you may not use this file except in compliance
8
- * with the License. You may obtain a copy of the License at
9
- *
10
- * http://www.apache.org/licenses/LICENSE-2.0
11
- *
12
- * Unless required by applicable law or agreed to in writing,
13
- * software distributed under the License is distributed on an
14
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
- * KIND, either express or implied. See the License for the
16
- * specific language governing permissions and limitations
17
- * under the License.
18
- */
19
-
20
-
21
- #include "py_serde.hpp"
22
- #include "py_object_ostream.hpp"
23
- #include "frequent_items_sketch.hpp"
24
-
25
- #include <pybind11/pybind11.h>
26
-
27
- #include <ostream>
28
-
29
- namespace py = pybind11;
30
-
31
- // forward declarations
32
- // std::string and arithmetic types, where we don't need a separate serde
33
- template<typename T, typename W, typename H, typename E, typename std::enable_if<std::is_arithmetic<T>::value || std::is_same<std::string, T>::value, bool>::type = 0>
34
- void add_serialization(py::class_<datasketches::frequent_items_sketch<T, W, H, E>>& clazz);
35
-
36
- // py::object and other types where the caller must provide a serde
37
- template<typename T, typename W, typename H, typename E, typename std::enable_if<!std::is_arithmetic<T>::value && !std::is_same<std::string, T>::value, bool>::type = 0>
38
- void add_serialization(py::class_<datasketches::frequent_items_sketch<T, W, H, E>>& clazz);
39
-
40
- template<typename T, typename W, typename H, typename E>
41
- void bind_fi_sketch(py::module &m, const char* name) {
42
- using namespace datasketches;
43
-
44
- auto fi_class = py::class_<frequent_items_sketch<T, W, H, E>>(m, name)
45
- .def(py::init<uint8_t>(), py::arg("lg_max_k"))
46
- .def("__str__", &frequent_items_sketch<T, W, H, E>::to_string, py::arg("print_items")=false,
47
- "Produces a string summary of the sketch")
48
- .def("to_string", &frequent_items_sketch<T, W, H, E>::to_string, py::arg("print_items")=false,
49
- "Produces a string summary of the sketch")
50
- .def("update", (void (frequent_items_sketch<T, W, H, E>::*)(const T&, uint64_t)) &frequent_items_sketch<T, W, H, E>::update, py::arg("item"), py::arg("weight")=1,
51
- "Updates the sketch with the given string and, optionally, a weight")
52
- .def("merge", (void (frequent_items_sketch<T, W, H, E>::*)(const frequent_items_sketch<T, W, H, E>&)) &frequent_items_sketch<T, W, H, E>::merge,
53
- "Merges the given sketch into this one")
54
- .def("is_empty", &frequent_items_sketch<T, W, H, E>::is_empty,
55
- "Returns True if the sketch is empty, otherwise False")
56
- .def("get_num_active_items", &frequent_items_sketch<T, W, H, E>::get_num_active_items,
57
- "Returns the number of active items in the sketch")
58
- .def("get_total_weight", &frequent_items_sketch<T, W, H, E>::get_total_weight,
59
- "Returns the sum of the weights (frequencies) in the stream seen so far by the sketch")
60
- .def("get_estimate", &frequent_items_sketch<T, W, H, E>::get_estimate, py::arg("item"),
61
- "Returns the estimate of the weight (frequency) of the given item.\n"
62
- "Note: The true frequency of a item would be the sum of the counts as a result of the "
63
- "two update functions.")
64
- .def("get_lower_bound", &frequent_items_sketch<T, W, H, E>::get_lower_bound, py::arg("item"),
65
- "Returns the guaranteed lower bound weight (frequency) of the given item.")
66
- .def("get_upper_bound", &frequent_items_sketch<T, W, H, E>::get_upper_bound, py::arg("item"),
67
- "Returns the guaranteed upper bound weight (frequency) of the given item.")
68
- .def("get_sketch_epsilon", (double (frequent_items_sketch<T, W, H, E>::*)(void) const) &frequent_items_sketch<T, W, H, E>::get_epsilon,
69
- "Returns the epsilon value used by the sketch to compute error")
70
- .def(
71
- "get_frequent_items",
72
- [](const frequent_items_sketch<T, W, H, E>& sk, frequent_items_error_type err_type, uint64_t threshold) {
73
- if (threshold == 0) threshold = sk.get_maximum_error();
74
- py::list list;
75
- auto rows = sk.get_frequent_items(err_type, threshold);
76
- for (auto row: rows) {
77
- list.append(py::make_tuple(
78
- row.get_item(),
79
- row.get_estimate(),
80
- row.get_lower_bound(),
81
- row.get_upper_bound())
82
- );
83
- }
84
- return list;
85
- },
86
- py::arg("err_type"), py::arg("threshold")=0
87
- )
88
- .def_static(
89
- "get_epsilon_for_lg_size",
90
- [](uint8_t lg_max_map_size) { return frequent_items_sketch<T, W, H, E>::get_epsilon(lg_max_map_size); },
91
- py::arg("lg_max_map_size"),
92
- "Returns the epsilon value used to compute a priori error for a given log2(max_map_size)"
93
- )
94
- .def_static(
95
- "get_apriori_error",
96
- &frequent_items_sketch<T, W, H, E>::get_apriori_error,
97
- py::arg("lg_max_map_size"), py::arg("estimated_total_weight"),
98
- "Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight."
99
- );
100
-
101
- // serialization may need a caller-provided serde depending on the sketch type, so
102
- // we use a separate method to handle that appropriately based on type T.
103
- add_serialization(fi_class);
104
- }
105
-
106
- // std::string or arithmetic types, for which we have a built-in serde
107
- template<typename T, typename W, typename H, typename E, typename std::enable_if<std::is_arithmetic<T>::value || std::is_same<std::string, T>::value, bool>::type>
108
- void add_serialization(py::class_<datasketches::frequent_items_sketch<T, W, H, E>>& clazz) {
109
- using namespace datasketches;
110
- clazz.def(
111
- "get_serialized_size_bytes",
112
- [](const frequent_items_sketch<T, W, H, E>& sk) { return sk.get_serialized_size_bytes(); },
113
- "Computes the size needed to serialize the current state of the sketch. This can be expensive since every item needs to be looked at."
114
- )
115
- .def(
116
- "serialize",
117
- [](const frequent_items_sketch<T, W, H, E>& sk) {
118
- auto bytes = sk.serialize();
119
- return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
120
- },
121
- "Serializes the sketch into a bytes object."
122
- )
123
- .def_static(
124
- "deserialize",
125
- [](const std::string& bytes) { return frequent_items_sketch<T, W, H, E>::deserialize(bytes.data(), bytes.size()); },
126
- py::arg("bytes"),
127
- "Reads a bytes object and returns the corresponding frequent_strings_sketch."
128
- );
129
- }
130
-
131
- // py::object or any other type that requires a provided serde
132
- template<typename T, typename W, typename H, typename E, typename std::enable_if<!std::is_arithmetic<T>::value && !std::is_same<std::string, T>::value, bool>::type>
133
- void add_serialization(py::class_<datasketches::frequent_items_sketch<T, W, H, E>>& clazz) {
134
- using namespace datasketches;
135
- clazz.def(
136
- "get_serialized_size_bytes",
137
- [](const frequent_items_sketch<T, W, H, E>& sk, py_object_serde& serde) { return sk.get_serialized_size_bytes(serde); },
138
- py::arg("serde"),
139
- "Computes the size needed to serialize the current state of the sketch using the provided serde. This can be expensive since every item needs to be looked at."
140
- )
141
- .def(
142
- "serialize",
143
- [](const frequent_items_sketch<T, W, H, E>& sk, py_object_serde& serde) {
144
- auto bytes = sk.serialize(0, serde);
145
- return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
146
- }, py::arg("serde"),
147
- "Serializes the sketch into a bytes object using the provided serde."
148
- )
149
- .def_static(
150
- "deserialize",
151
- [](const std::string& bytes, py_object_serde& serde) {
152
- return frequent_items_sketch<T, W, H, E>::deserialize(bytes.data(), bytes.size(), serde);
153
- }, py::arg("bytes"), py::arg("serde"),
154
- "Reads a bytes object using the provided serde and returns the corresponding frequent_strings_sketch."
155
- );
156
- }
157
-
158
- // calls class __hash__ method
159
- struct py_hash_caller {
160
- size_t operator()(const py::object& a) const {
161
- return py::hash(a);
162
- }
163
- };
164
-
165
- // calls class __eq__ method
166
- struct py_equal_caller {
167
- bool operator()(const py::object& a, const py::object& b) const {
168
- return a.equal(b);
169
- }
170
- };
171
-
172
- void init_fi(py::module &m) {
173
- using namespace datasketches;
174
-
175
- py::enum_<frequent_items_error_type>(m, "frequent_items_error_type")
176
- .value("NO_FALSE_POSITIVES", NO_FALSE_POSITIVES)
177
- .value("NO_FALSE_NEGATIVES", NO_FALSE_NEGATIVES)
178
- .export_values();
179
-
180
- bind_fi_sketch<std::string, uint64_t, std::hash<std::string>, std::equal_to<std::string>>(m, "frequent_strings_sketch");
181
- bind_fi_sketch<py::object, uint64_t, py_hash_caller, py_equal_caller>(m, "frequent_items_sketch");
182
- }
@@ -1,126 +0,0 @@
1
- /*
2
- * Licensed to the Apache Software Foundation (ASF) under one
3
- * or more contributor license agreements. See the NOTICE file
4
- * distributed with this work for additional information
5
- * regarding copyright ownership. The ASF licenses this file
6
- * to you under the Apache License, Version 2.0 (the
7
- * "License"); you may not use this file except in compliance
8
- * with the License. You may obtain a copy of the License at
9
- *
10
- * http://www.apache.org/licenses/LICENSE-2.0
11
- *
12
- * Unless required by applicable law or agreed to in writing,
13
- * software distributed under the License is distributed on an
14
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
- * KIND, either express or implied. See the License for the
16
- * specific language governing permissions and limitations
17
- * under the License.
18
- */
19
-
20
- #include <pybind11/pybind11.h>
21
-
22
- #include "hll.hpp"
23
-
24
- namespace py = pybind11;
25
-
26
- void init_hll(py::module &m) {
27
- using namespace datasketches;
28
-
29
- py::enum_<target_hll_type>(m, "tgt_hll_type", "Target HLL flavor")
30
- .value("HLL_4", HLL_4)
31
- .value("HLL_6", HLL_6)
32
- .value("HLL_8", HLL_8)
33
- .export_values();
34
-
35
- py::class_<hll_sketch>(m, "hll_sketch")
36
- .def(py::init<uint8_t>(), py::arg("lg_k"))
37
- .def(py::init<uint8_t, target_hll_type>(), py::arg("lg_k"), py::arg("tgt_type"))
38
- .def(py::init<uint8_t, target_hll_type, bool>(), py::arg("lg_k"), py::arg("tgt_type"), py::arg("start_max_size")=false)
39
- .def("__str__", (std::string (hll_sketch::*)(bool,bool,bool,bool) const) &hll_sketch::to_string,
40
- py::arg("summary")=true, py::arg("detail")=false, py::arg("aux_detail")=false, py::arg("all")=false,
41
- "Produces a string summary of the sketch")
42
- .def("to_string", (std::string (hll_sketch::*)(bool,bool,bool,bool) const) &hll_sketch::to_string,
43
- py::arg("summary")=true, py::arg("detail")=false, py::arg("aux_detail")=false, py::arg("all")=false,
44
- "Produces a string summary of the sketch")
45
- .def_property_readonly("lg_config_k", &hll_sketch::get_lg_config_k, "Configured lg_k value for the sketch")
46
- .def_property_readonly("tgt_type", &hll_sketch::get_target_type, "Returns the HLL type (4, 6, or 8) when in estimation mode")
47
- .def("get_estimate", &hll_sketch::get_estimate,
48
- "Estimate of the distinct count of the input stream")
49
- .def("get_lower_bound", &hll_sketch::get_lower_bound, py::arg("num_std_devs"),
50
- "Returns the approximate lower error bound given the specified number of standard deviations in {1, 2, 3}")
51
- .def("get_upper_bound", &hll_sketch::get_upper_bound, py::arg("num_std_devs"),
52
- "Returns the approximate upper error bound given the specified number of standard deviations in {1, 2, 3}")
53
- .def("is_compact", &hll_sketch::is_compact,
54
- "True if the sketch is compact, otherwise False")
55
- .def("is_empty", &hll_sketch::is_empty,
56
- "True if the sketch is empty, otherwise False")
57
- .def("get_updatable_serialization_bytes", &hll_sketch::get_updatable_serialization_bytes,
58
- "Returns the size of the serialized sketch")
59
- .def("get_compact_serialization_bytes", &hll_sketch::get_compact_serialization_bytes,
60
- "Returns the size of the serialized sketch when compressing the exception table if HLL_4")
61
- .def("reset", &hll_sketch::reset,
62
- "Resets the sketch to the empty state in coupon collection mode")
63
- .def("update", (void (hll_sketch::*)(int64_t)) &hll_sketch::update, py::arg("datum"),
64
- "Updates the sketch with the given integral value")
65
- .def("update", (void (hll_sketch::*)(double)) &hll_sketch::update, py::arg("datum"),
66
- "Updates the sketch with the given floating point value")
67
- .def("update", (void (hll_sketch::*)(const std::string&)) &hll_sketch::update, py::arg("datum"),
68
- "Updates the sketch with the given string value")
69
- .def_static("get_max_updatable_serialization_bytes", &hll_sketch::get_max_updatable_serialization_bytes,
70
- py::arg("lg_k"), py::arg("tgt_type"),
71
- "Provides a likely upper bound on serialization size for the given parameters")
72
- .def_static("get_rel_err", &hll_sketch::get_rel_err,
73
- py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"),
74
- "Returns the a priori relative error bound for the given parameters")
75
- .def(
76
- "serialize_compact",
77
- [](const hll_sketch& sk) {
78
- auto bytes = sk.serialize_compact();
79
- return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
80
- },
81
- "Serializes the sketch into a bytes object, compressing the exception table if HLL_4"
82
- )
83
- .def(
84
- "serialize_updatable",
85
- [](const hll_sketch& sk) {
86
- auto bytes = sk.serialize_updatable();
87
- return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
88
- },
89
- "Serializes the sketch into a bytes object"
90
- )
91
- .def_static(
92
- "deserialize",
93
- [](const std::string& bytes) { return hll_sketch::deserialize(bytes.data(), bytes.size()); },
94
- py::arg("bytes"),
95
- "Reads a bytes object and returns the corresponding hll_sketch"
96
- );
97
-
98
- py::class_<hll_union>(m, "hll_union")
99
- .def(py::init<uint8_t>(), py::arg("lg_max_k"))
100
- .def_property_readonly("lg_config_k", &hll_union::get_lg_config_k, "Configured lg_k value for the union")
101
- .def_property_readonly("tgt_type", &hll_union::get_target_type, "Returns the HLL type (4, 6, or 8) when in estimation mode")
102
- .def("get_estimate", &hll_union::get_estimate,
103
- "Estimate of the distinct count of the input stream")
104
- .def("get_lower_bound", &hll_union::get_lower_bound, py::arg("num_std_devs"),
105
- "Returns the approximate lower error bound given the specified number of standard deviations in {1, 2, 3}")
106
- .def("get_upper_bound", &hll_union::get_upper_bound, py::arg("num_std_devs"),
107
- "Returns the approximate upper error bound given the specified number of standard deviations in {1, 2, 3}")
108
- .def("is_empty", &hll_union::is_empty,
109
- "True if the union is empty, otherwise False")
110
- .def("reset", &hll_union::reset,
111
- "Resets the union to the empty state")
112
- .def("get_result", &hll_union::get_result, py::arg("tgt_type")=HLL_4,
113
- "Returns a sketch of the target type representing the current union state")
114
- .def<void (hll_union::*)(const hll_sketch&)>("update", &hll_union::update, py::arg("sketch"),
115
- "Updates the union with the given HLL sketch")
116
- .def<void (hll_union::*)(int64_t)>("update", &hll_union::update, py::arg("datum"),
117
- "Updates the union with the given integral value")
118
- .def<void (hll_union::*)(double)>("update", &hll_union::update, py::arg("datum"),
119
- "Updates the union with the given floating point value")
120
- .def<void (hll_union::*)(const std::string&)>("update", &hll_union::update, py::arg("datum"),
121
- "Updates the union with the given string value")
122
- .def_static("get_rel_err", &hll_union::get_rel_err,
123
- py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"),
124
- "Returns the a priori relative error bound for the given parameters")
125
- ;
126
- }
@@ -1,158 +0,0 @@
1
- /*
2
- * Licensed to the Apache Software Foundation (ASF) under one
3
- * or more contributor license agreements. See the NOTICE file
4
- * distributed with this work for additional information
5
- * regarding copyright ownership. The ASF licenses this file
6
- * to you under the Apache License, Version 2.0 (the
7
- * "License"); you may not use this file except in compliance
8
- * with the License. You may obtain a copy of the License at
9
- *
10
- * http://www.apache.org/licenses/LICENSE-2.0
11
- *
12
- * Unless required by applicable law or agreed to in writing,
13
- * software distributed under the License is distributed on an
14
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
- * KIND, either express or implied. See the License for the
16
- * specific language governing permissions and limitations
17
- * under the License.
18
- */
19
-
20
- #include "py_object_lt.hpp"
21
- #include "py_object_ostream.hpp"
22
- #include "quantile_conditional.hpp"
23
- #include "kll_sketch.hpp"
24
-
25
- #include <pybind11/pybind11.h>
26
- #include <pybind11/stl.h>
27
- #include <vector>
28
- #include <stdexcept>
29
-
30
- namespace py = pybind11;
31
-
32
- template<typename T, typename C>
33
- void bind_kll_sketch(py::module &m, const char* name) {
34
- using namespace datasketches;
35
-
36
- auto kll_class = py::class_<kll_sketch<T, C>>(m, name)
37
- .def(py::init<uint16_t>(), py::arg("k")=kll_constants::DEFAULT_K)
38
- .def(py::init<const kll_sketch<T, C>&>())
39
- .def(
40
- "update",
41
- static_cast<void (kll_sketch<T, C>::*)(const T&)>(&kll_sketch<T, C>::update),
42
- py::arg("item"),
43
- "Updates the sketch with the given value"
44
- )
45
- .def("merge", (void (kll_sketch<T, C>::*)(const kll_sketch<T, C>&)) &kll_sketch<T, C>::merge, py::arg("sketch"),
46
- "Merges the provided sketch into this one")
47
- .def("__str__", &kll_sketch<T, C>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
48
- "Produces a string summary of the sketch")
49
- .def("to_string", &kll_sketch<T, C>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
50
- "Produces a string summary of the sketch")
51
- .def("is_empty", &kll_sketch<T, C>::is_empty,
52
- "Returns True if the sketch is empty, otherwise False")
53
- .def("get_k", &kll_sketch<T, C>::get_k,
54
- "Returns the configured parameter k")
55
- .def("get_n", &kll_sketch<T, C>::get_n,
56
- "Returns the length of the input stream")
57
- .def("get_num_retained", &kll_sketch<T, C>::get_num_retained,
58
- "Returns the number of retained items (samples) in the sketch")
59
- .def("is_estimation_mode", &kll_sketch<T, C>::is_estimation_mode,
60
- "Returns True if the sketch is in estimation mode, otherwise False")
61
- .def("get_min_value", &kll_sketch<T, C>::get_min_item,
62
- "Returns the minimum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
63
- .def("get_max_value", &kll_sketch<T, C>::get_max_item,
64
- "Returns the maximum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
65
- .def("get_quantile", &kll_sketch<T, C>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
66
- "Returns an approximation to the data value "
67
- "associated with the given normalized rank in a hypothetical sorted "
68
- "version of the input stream so far.\n"
69
- "For kll_floats_sketch: if the sketch is empty this returns nan. "
70
- "For kll_ints_sketch: if the sketch is empty this throws a RuntimeError.")
71
- .def(
72
- "get_quantiles",
73
- [](const kll_sketch<T, C>& sk, const std::vector<double>& ranks, bool inclusive) {
74
- return sk.get_quantiles(ranks.data(), ranks.size(), inclusive);
75
- },
76
- py::arg("ranks"), py::arg("inclusive")=false,
77
- "This returns an array that could have been generated by using get_quantile() for each "
78
- "normalized rank separately.\n"
79
- "If the sketch is empty this returns an empty vector.\n"
80
- "Deprecated. Will be removed in the next major version. Use get_quantile() instead."
81
- )
82
- .def("get_rank", &kll_sketch<T, C>::get_rank, py::arg("value"), py::arg("inclusive")=false,
83
- "Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
84
- "The resulting approximation has a probabilistic guarantee that can be obtained from the "
85
- "get_normalized_rank_error(False) function.\n"
86
- "With the parameter inclusive=true the weight of the given value is included into the rank."
87
- "Otherwise the rank equals the sum of the weights of values less than the given value.\n"
88
- "If the sketch is empty this returns nan.")
89
- .def(
90
- "get_pmf",
91
- [](const kll_sketch<T, C>& sk, const std::vector<T>& split_points, bool inclusive) {
92
- return sk.get_PMF(split_points.data(), split_points.size(), inclusive);
93
- },
94
- py::arg("split_points"), py::arg("inclusive")=false,
95
- "Returns an approximation to the Probability Mass Function (PMF) of the input stream "
96
- "given a set of split points (values).\n"
97
- "The resulting approximations have a probabilistic guarantee that can be obtained from the "
98
- "get_normalized_rank_error(True) function.\n"
99
- "If the sketch is empty this returns an empty vector.\n"
100
- "split_points is an array of m unique, monotonically increasing float values "
101
- "that divide the real number line into m+1 consecutive disjoint intervals.\n"
102
- "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
103
- "exclusive of the right split point, with the exception that the last interval will include "
104
- "the maximum value.\n"
105
- "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
106
- "inclusive of the right split point.\n"
107
- "It is not necessary to include either the min or max values in these split points."
108
- )
109
- .def(
110
- "get_cdf",
111
- [](const kll_sketch<T, C>& sk, const std::vector<T>& split_points, bool inclusive) {
112
- return sk.get_CDF(split_points.data(), split_points.size(), inclusive);
113
- },
114
- py::arg("split_points"), py::arg("inclusive")=false,
115
- "Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
116
- "cumulative analog of the PMF, of the input stream given a set of split points (values).\n"
117
- "The resulting approximations have a probabilistic guarantee that can be obtained from the "
118
- "get_normalized_rank_error(True) function.\n"
119
- "If the sketch is empty this returns an empty vector.\n"
120
- "split_points is an array of m unique, monotonically increasing float values "
121
- "that divide the real number line into m+1 consecutive disjoint intervals.\n"
122
- "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
123
- "exclusive of the right split point, with the exception that the last interval will include "
124
- "the maximum value.\n"
125
- "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
126
- "inclusive of the right split point.\n"
127
- "It is not necessary to include either the min or max values in these split points."
128
- )
129
- .def(
130
- "normalized_rank_error",
131
- static_cast<double (kll_sketch<T, C>::*)(bool) const>(&kll_sketch<T, C>::get_normalized_rank_error),
132
- py::arg("as_pmf"),
133
- "Gets the normalized rank error for this sketch.\n"
134
- "If pmf is True, returns the 'double-sided' normalized rank error for the get_PMF() function.\n"
135
- "Otherwise, it is the 'single-sided' normalized rank error for all the other queries.\n"
136
- "Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials"
137
- )
138
- .def_static(
139
- "get_normalized_rank_error",
140
- [](uint16_t k, bool pmf) { return kll_sketch<T, C>::get_normalized_rank_error(k, pmf); },
141
- py::arg("k"), py::arg("as_pmf"),
142
- "Gets the normalized rank error given parameters k and the pmf flag.\n"
143
- "If pmf is True, returns the 'double-sided' normalized rank error for the get_PMF() function.\n"
144
- "Otherwise, it is the 'single-sided' normalized rank error for all the other queries.\n"
145
- "Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials"
146
- )
147
- .def("__iter__", [](const kll_sketch<T, C>& s) { return py::make_iterator(s.begin(), s.end()); });
148
-
149
- add_serialization<T>(kll_class);
150
- add_vector_update<T>(kll_class);
151
- }
152
-
153
- void init_kll(py::module &m) {
154
- bind_kll_sketch<int, std::less<int>>(m, "kll_ints_sketch");
155
- bind_kll_sketch<float, std::less<float>>(m, "kll_floats_sketch");
156
- bind_kll_sketch<double, std::less<double>>(m, "kll_doubles_sketch");
157
- bind_kll_sketch<py::object, py_object_lt>(m, "kll_items_sketch");
158
- }
@@ -1,68 +0,0 @@
1
- /*
2
- * Licensed to the Apache Software Foundation (ASF) under one
3
- * or more contributor license agreements. See the NOTICE file
4
- * distributed with this work for additional information
5
- * regarding copyright ownership. The ASF licenses this file
6
- * to you under the Apache License, Version 2.0 (the
7
- * "License"); you may not use this file except in compliance
8
- * with the License. You may obtain a copy of the License at
9
- *
10
- * http://www.apache.org/licenses/LICENSE-2.0
11
- *
12
- * Unless required by applicable law or agreed to in writing,
13
- * software distributed under the License is distributed on an
14
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
- * KIND, either express or implied. See the License for the
16
- * specific language governing permissions and limitations
17
- * under the License.
18
- */
19
-
20
- #include "kolmogorov_smirnov.hpp"
21
- #include "kll_sketch.hpp"
22
- #include "quantiles_sketch.hpp"
23
-
24
- #include <pybind11/pybind11.h>
25
-
26
- namespace py = pybind11;
27
-
28
- void init_kolmogorov_smirnov(py::module &m) {
29
- using namespace datasketches;
30
-
31
- m.def("ks_test", &kolmogorov_smirnov::test<kll_sketch<int>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
32
- "Performs the Kolmogorov-Smirnov Test between kll_ints_sketches.\n"
33
- "Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
34
- "this will return false.\n"
35
- "Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
36
- "distribution) using the provided p-value, otherwise False.");
37
- m.def("ks_test", &kolmogorov_smirnov::test<kll_sketch<float>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
38
- "Performs the Kolmogorov-Smirnov Test between kll_floats_sketches.\n"
39
- "Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
40
- "this will return false.\n"
41
- "Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
42
- "distribution) using the provided p-value, otherwise False.");
43
- m.def("ks_test", &kolmogorov_smirnov::test<kll_sketch<double>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
44
- "Performs the Kolmogorov-Smirnov Test between kll_doubles_sketches.\n"
45
- "Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
46
- "this will return false.\n"
47
- "Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
48
- "distribution) using the provided p-value, otherwise False.");
49
-
50
- m.def("ks_test", &kolmogorov_smirnov::test<quantiles_sketch<int>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
51
- "Performs the Kolmogorov-Smirnov Test between quantiles_ints_sketches.\n"
52
- "Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
53
- "this will return false.\n"
54
- "Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
55
- "distribution) using the provided p-value, otherwise False.");
56
- m.def("ks_test", &kolmogorov_smirnov::test<quantiles_sketch<float>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
57
- "Performs the Kolmogorov-Smirnov Test between quantiles_floats_sketches.\n"
58
- "Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
59
- "this will return false.\n"
60
- "Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
61
- "distribution) using the provided p-value, otherwise False.");
62
- m.def("ks_test", &kolmogorov_smirnov::test<quantiles_sketch<double>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
63
- "Performs the Kolmogorov-Smirnov Test between quantiles_doubles_sketches.\n"
64
- "Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
65
- "this will return false.\n"
66
- "Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
67
- "distribution) using the provided p-value, otherwise False.");
68
- }