datasketches 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/NOTICE +1 -1
  4. data/README.md +0 -2
  5. data/ext/datasketches/cpc_wrapper.cpp +2 -2
  6. data/ext/datasketches/kll_wrapper.cpp +0 -10
  7. data/lib/datasketches/version.rb +1 -1
  8. data/lib/datasketches.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  10. data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
  11. data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
  12. data/vendor/datasketches-cpp/Doxyfile +2827 -0
  13. data/vendor/datasketches-cpp/LICENSE +0 -76
  14. data/vendor/datasketches-cpp/NOTICE +1 -1
  15. data/vendor/datasketches-cpp/README.md +1 -3
  16. data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
  18. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
  19. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
  20. data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
  21. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
  22. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
  23. data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
  24. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  25. data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
  26. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
  27. data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
  28. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
  29. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
  30. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
  31. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
  32. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
  33. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
  34. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
  35. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
  36. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
  37. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
  38. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
  39. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
  40. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
  41. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
  42. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
  43. data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
  44. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
  45. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
  46. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
  47. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
  48. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
  49. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
  50. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
  51. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
  52. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
  53. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
  54. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
  55. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
  56. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
  57. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
  58. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
  59. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
  60. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
  62. data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
  63. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
  64. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
  65. data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
  66. data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
  67. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
  70. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
  71. data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
  74. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
  75. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
  76. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
  77. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
  78. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
  79. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
  80. data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
  84. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
  85. data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
  86. data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
  87. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
  88. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
  89. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
  90. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
  91. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
  92. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
  94. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
  95. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
  96. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
  97. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
  98. data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
  99. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
  100. data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
  101. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
  102. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
  103. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
  104. data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
  105. data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
  106. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
  107. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
  108. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  109. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
  110. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
  111. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
  112. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
  114. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
  115. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
  116. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
  117. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
  119. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
  120. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
  121. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
  122. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
  124. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
  125. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
  126. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
  127. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
  128. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
  129. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
  130. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
  131. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
  132. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
  133. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
  134. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
  135. data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
  136. data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
  137. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
  138. data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
  139. data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
  140. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
  141. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
  142. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
  143. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
  144. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
  145. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
  146. data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
  147. data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
  148. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
  149. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
  150. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
  151. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
  152. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
  153. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
  154. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  155. metadata +47 -93
  156. data/vendor/datasketches-cpp/MANIFEST.in +0 -39
  157. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  160. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  161. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  162. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  163. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  164. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  165. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  166. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  167. data/vendor/datasketches-cpp/pyproject.toml +0 -23
  168. data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
  169. data/vendor/datasketches-cpp/python/README.md +0 -85
  170. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
  171. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
  172. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
  173. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
  174. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
  175. data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
  176. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
  177. data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
  178. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
  179. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
  180. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
  181. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
  182. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
  183. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
  184. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
  185. data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
  186. data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
  187. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
  188. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
  189. data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
  190. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
  191. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
  192. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
  193. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
  194. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
  195. data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
  196. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
  197. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
  198. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
  199. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
  200. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
  201. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
  202. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
  203. data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
  204. data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
  205. data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
  206. data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
  207. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
  208. data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
  209. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
  210. data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
  211. data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
  212. data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
  213. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
  214. data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
  215. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  216. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  217. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  218. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  219. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  220. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
  221. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  222. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  223. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  224. data/vendor/datasketches-cpp/setup.py +0 -110
  225. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  226. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  227. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  228. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tox.ini +0 -26
  230. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
  231. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
  232. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
  233. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
  234. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -1,490 +0,0 @@
1
- /*
2
- * Licensed to the Apache Software Foundation (ASF) under one
3
- * or more contributor license agreements. See the NOTICE file
4
- * distributed with this work for additional information
5
- * regarding copyright ownership. The ASF licenses this file
6
- * to you under the Apache License, Version 2.0 (the
7
- * "License"); you may not use this file except in compliance
8
- * with the License. You may obtain a copy of the License at
9
- *
10
- * http://www.apache.org/licenses/LICENSE-2.0
11
- *
12
- * Unless required by applicable law or agreed to in writing,
13
- * software distributed under the License is distributed on an
14
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
- * KIND, either express or implied. See the License for the
16
- * specific language governing permissions and limitations
17
- * under the License.
18
- */
19
-
20
- #include "kll_sketch.hpp"
21
-
22
- #include <pybind11/pybind11.h>
23
- #include <pybind11/stl.h>
24
- #include <pybind11/numpy.h>
25
- #include <sstream>
26
- #include <vector>
27
- #include <stdexcept>
28
-
29
- namespace py = pybind11;
30
-
31
- namespace datasketches {
32
-
33
- namespace vector_of_kll_constants {
34
- static const uint32_t DEFAULT_K = kll_constants::DEFAULT_K;
35
- static const uint32_t DEFAULT_D = 1;
36
- }
37
-
38
- // Wrapper class for Numpy compatibility
39
- template <typename T, typename C = std::less<T>>
40
- class vector_of_kll_sketches {
41
- public:
42
- explicit vector_of_kll_sketches(uint32_t k = vector_of_kll_constants::DEFAULT_K, uint32_t d = vector_of_kll_constants::DEFAULT_D);
43
- vector_of_kll_sketches(const vector_of_kll_sketches& other);
44
- vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept;
45
- vector_of_kll_sketches<T, C>& operator=(const vector_of_kll_sketches& other);
46
- vector_of_kll_sketches<T, C>& operator=(vector_of_kll_sketches&& other);
47
-
48
- // container parameters
49
- inline uint32_t get_k() const;
50
- inline uint32_t get_d() const;
51
-
52
- // sketch updates/merges
53
- void update(const py::array_t<T>& items);
54
- void merge(const vector_of_kll_sketches<T>& other);
55
-
56
- // returns a single sketch combining all data in the array
57
- kll_sketch<T, C> collapse(const py::array_t<int>& isk) const;
58
-
59
- // sketch queries returning an array of results
60
- py::array is_empty() const;
61
- py::array get_n() const;
62
- py::array is_estimation_mode() const;
63
- py::array get_min_values() const;
64
- py::array get_max_values() const;
65
- py::array get_num_retained() const;
66
- py::array get_quantiles(const py::array_t<double>& ranks, const py::array_t<int>& isk) const;
67
- py::array get_ranks(const py::array_t<T>& values, const py::array_t<int>& isk) const;
68
- py::array get_pmf(const py::array_t<T>& split_points, const py::array_t<int>& isk) const;
69
- py::array get_cdf(const py::array_t<T>& split_points, const py::array_t<int>& isk) const;
70
-
71
- // human-readable output
72
- std::string to_string(bool print_levels = false, bool print_items = false) const;
73
-
74
- // binary output/input
75
- py::list serialize(const py::array_t<int>& isk);
76
- // note: deserialize() replaces the sketch at the specified
77
- // index. Not a static method.
78
- void deserialize(const py::bytes& sk_bytes, uint32_t idx);
79
-
80
- private:
81
- std::vector<uint32_t> get_indices(const py::array_t<int>& isk) const;
82
-
83
- const uint32_t k_; // kll sketch k parameter
84
- const uint32_t d_; // number of dimensions (here: sketches) to hold
85
- std::vector<kll_sketch<T, C>> sketches_;
86
- };
87
-
88
- template<typename T, typename C>
89
- vector_of_kll_sketches<T, C>::vector_of_kll_sketches(uint32_t k, uint32_t d):
90
- k_(k),
91
- d_(d)
92
- {
93
- // check d is valid (k is checked by kll_sketch)
94
- if (d < 1) {
95
- throw std::invalid_argument("D must be >= 1: " + std::to_string(d));
96
- }
97
-
98
- sketches_.reserve(d);
99
- // spawn the sketches
100
- for (uint32_t i = 0; i < d; i++) {
101
- sketches_.emplace_back(k);
102
- }
103
- }
104
-
105
- template<typename T, typename C>
106
- vector_of_kll_sketches<T, C>::vector_of_kll_sketches(const vector_of_kll_sketches& other) :
107
- k_(other.k_),
108
- d_(other.d_),
109
- sketches_(other.sketches_)
110
- {}
111
-
112
- template<typename T, typename C>
113
- vector_of_kll_sketches<T, C>::vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept :
114
- k_(other.k_),
115
- d_(other.d_),
116
- sketches_(std::move(other.sketches_))
117
- {}
118
-
119
- template<typename T, typename C>
120
- vector_of_kll_sketches<T, C>& vector_of_kll_sketches<T, C>::operator=(const vector_of_kll_sketches& other) {
121
- vector_of_kll_sketches<T, C> copy(other);
122
- k_ = copy.k_;
123
- d_ = copy.d_;
124
- std::swap(sketches_, copy.sketches_);
125
- return *this;
126
- }
127
-
128
- template<typename T, typename C>
129
- vector_of_kll_sketches<T, C>& vector_of_kll_sketches<T, C>::operator=(vector_of_kll_sketches&& other) {
130
- k_ = other.k_;
131
- d_ = other.d_;
132
- std::swap(sketches_, other.sketches_);
133
- return *this;
134
- }
135
-
136
- template<typename T, typename C>
137
- uint32_t vector_of_kll_sketches<T, C>::get_k() const {
138
- return k_;
139
- }
140
-
141
- template<typename T, typename C>
142
- uint32_t vector_of_kll_sketches<T, C>::get_d() const {
143
- return d_;
144
- }
145
-
146
- template<typename T, typename C>
147
- std::vector<uint32_t> vector_of_kll_sketches<T, C>::get_indices(const py::array_t<int>& isk) const {
148
- std::vector<uint32_t> indices;
149
- if (isk.size() == 1) {
150
- auto data = isk.unchecked();
151
- if (data(0) == -1) {
152
- indices.reserve(d_);
153
- for (uint32_t i = 0; i < d_; ++i) {
154
- indices.push_back(i);
155
- }
156
- } else {
157
- indices.push_back(static_cast<uint32_t>(data(0)));
158
- }
159
- } else {
160
- auto data = isk.unchecked<1>();
161
- indices.reserve(isk.size());
162
- for (uint32_t i = 0; i < isk.size(); ++i) {
163
- const uint32_t idx = static_cast<uint32_t>(data(i));
164
- if (idx < d_) {
165
- indices.push_back(idx);
166
- } else {
167
- throw std::invalid_argument("request for invalid dimenions >= d ("
168
- + std::to_string(d_) +"): "+ std::to_string(idx));
169
- }
170
- }
171
- }
172
- return indices;
173
- }
174
-
175
- // Checks if each sketch is empty or not
176
- template<typename T, typename C>
177
- py::array vector_of_kll_sketches<T, C>::is_empty() const {
178
- std::vector<bool> vals(d_);
179
- for (uint32_t i = 0; i < d_; ++i) {
180
- vals[i] = sketches_[i].is_empty();
181
- }
182
-
183
- return py::cast(vals);
184
- }
185
-
186
- // Updates each sketch with values
187
- // Currently: all values must be present
188
- // TODO: allow subsets of sketches to be updated
189
- template<typename T, typename C>
190
- void vector_of_kll_sketches<T, C>::update(const py::array_t<T>& items) {
191
-
192
- size_t ndim = items.ndim();
193
-
194
- if (items.shape(ndim-1) != d_) {
195
- throw std::invalid_argument("input data must have rows with " + std::to_string(d_)
196
- + " elements. Found: " + std::to_string(items.shape(ndim-1)));
197
- }
198
-
199
- if (ndim == 1) {
200
- // 1D case: single value to update per sketch
201
- auto data = items.template unchecked<1>();
202
- for (uint32_t i = 0; i < d_; ++i) {
203
- sketches_[i].update(data(i));
204
- }
205
- }
206
- else if (ndim == 2) {
207
- // 2D case: multiple values to update per sketch
208
- auto data = items.template unchecked<2>();
209
- if (items.flags() & py::array::f_style) {
210
- for (uint32_t j = 0; j < d_; ++j) {
211
- for (uint32_t i = 0; i < items.shape(0); ++i) {
212
- sketches_[j].update(data(i,j));
213
- }
214
- }
215
- } else { // py::array::c_style or py::array::forcecast
216
- for (uint32_t i = 0; i < items.shape(0); ++i) {
217
- for (uint32_t j = 0; j < d_; ++j) {
218
- sketches_[j].update(data(i,j));
219
- }
220
- }
221
- }
222
- }
223
- else {
224
- throw std::invalid_argument("Update input must be 2 or fewer dimensions : " + std::to_string(ndim));
225
- }
226
- }
227
-
228
- // Merges two arrays of sketches
229
- // Currently: all values must be present
230
- template<typename T, typename C>
231
- void vector_of_kll_sketches<T, C>::merge(const vector_of_kll_sketches<T>& other) {
232
- if (d_ != other.get_d()) {
233
- throw std::invalid_argument("Must have same number of dimensions to merge: " + std::to_string(d_)
234
- + " vs " + std::to_string(other.d_));
235
- } else {
236
- for (uint32_t i = 0; i < d_; ++i) {
237
- sketches_[i].merge(other.sketches_[i]);
238
- }
239
- }
240
- }
241
-
242
- template<typename T, typename C>
243
- kll_sketch<T, C> vector_of_kll_sketches<T, C>::collapse(const py::array_t<int>& isk) const {
244
- std::vector<uint32_t> inds = get_indices(isk);
245
-
246
- kll_sketch<T, C> result(k_);
247
- for (auto& idx : inds) {
248
- result.merge(sketches_[idx]);
249
- }
250
- return result;
251
- }
252
-
253
- // Number of updates for each sketch
254
- template<typename T, typename C>
255
- py::array vector_of_kll_sketches<T, C>::get_n() const {
256
- std::vector<uint64_t> vals(d_);
257
- for (uint32_t i = 0; i < d_; ++i) {
258
- vals[i] = sketches_[i].get_n();
259
- }
260
- return py::cast(vals);
261
- }
262
-
263
- // Number of retained values for each sketch
264
- template<typename T, typename C>
265
- py::array vector_of_kll_sketches<T, C>::get_num_retained() const {
266
- std::vector<uint32_t> vals(d_);
267
- for (uint32_t i = 0; i < d_; ++i) {
268
- vals[i] = sketches_[i].get_num_retained();
269
- }
270
- return py::cast(vals);
271
- }
272
-
273
- // Gets the minimum value of each sketch
274
- // TODO: allow subsets of sketches
275
- template<typename T, typename C>
276
- py::array vector_of_kll_sketches<T, C>::get_min_values() const {
277
- std::vector<T> vals(d_);
278
- for (uint32_t i = 0; i < d_; ++i) {
279
- vals[i] = sketches_[i].get_min_item();
280
- }
281
- return py::cast(vals);
282
- }
283
-
284
- // Gets the maximum value of each sketch
285
- // TODO: allow subsets of sketches
286
- template<typename T, typename C>
287
- py::array vector_of_kll_sketches<T, C>::get_max_values() const {
288
- std::vector<T> vals(d_);
289
- for (uint32_t i = 0; i < d_; ++i) {
290
- vals[i] = sketches_[i].get_max_item();
291
- }
292
- return py::cast(vals);
293
- }
294
-
295
- // Summary of each sketch as one long string
296
- // Users should use .split('\n\n') when calling it to build a list of each
297
- // sketch's summary
298
- template<typename T, typename C>
299
- std::string vector_of_kll_sketches<T, C>::to_string(bool print_levels, bool print_items) const {
300
- std::ostringstream ss;
301
- for (uint32_t i = 0; i < d_; ++i) {
302
- // all streams into 1 string, for compatibility with Python's str() behavior
303
- // users will need to split by \n\n, e.g., str(kll).split('\n\n')
304
- if (i > 0) ss << "\n";
305
- ss << sketches_[i].to_string(print_levels, print_items);
306
- }
307
- return ss.str();
308
- }
309
-
310
- template<typename T, typename C>
311
- py::array vector_of_kll_sketches<T, C>::is_estimation_mode() const {
312
- std::vector<bool> vals(d_);
313
- for (uint32_t i = 0; i < d_; ++i) {
314
- vals[i] = sketches_[i].is_estimation_mode();
315
- }
316
- return py::cast(vals);
317
- }
318
-
319
- // Value of sketch(es) corresponding to some quantile(s)
320
- template<typename T, typename C>
321
- py::array vector_of_kll_sketches<T, C>::get_quantiles(const py::array_t<double>& ranks,
322
- const py::array_t<int>& isk) const {
323
- std::vector<uint32_t> inds = get_indices(isk);
324
- size_t num_sketches = inds.size();
325
- size_t num_quantiles = ranks.size();
326
-
327
- std::vector<std::vector<T>> quants(num_sketches, std::vector<T>(num_quantiles));
328
- for (uint32_t i = 0; i < num_sketches; ++i) {
329
- for (size_t j = 0; j < num_quantiles; ++j) {
330
- quants[i][j] = sketches_[inds[i]].get_quantile(ranks.data()[j]);
331
- }
332
- }
333
-
334
- return py::cast(quants);
335
- }
336
-
337
- // Value of sketch(es) corresponding to some rank(s)
338
- template<typename T, typename C>
339
- py::array vector_of_kll_sketches<T, C>::get_ranks(const py::array_t<T>& values,
340
- const py::array_t<int>& isk) const {
341
- std::vector<uint32_t> inds = get_indices(isk);
342
- size_t num_sketches = inds.size();
343
- size_t num_ranks = values.size();
344
- auto vals = values.data();
345
-
346
- std::vector<std::vector<float>> ranks(num_sketches, std::vector<float>(num_ranks));
347
- for (uint32_t i = 0; i < num_sketches; ++i) {
348
- for (size_t j = 0; j < num_ranks; ++j) {
349
- ranks[i][j] = sketches_[inds[i]].get_rank(vals[j]);
350
- }
351
- }
352
-
353
- return py::cast(ranks);
354
- }
355
-
356
- // PMF(s) of sketch(es)
357
- template<typename T, typename C>
358
- py::array vector_of_kll_sketches<T, C>::get_pmf(const py::array_t<T>& split_points,
359
- const py::array_t<int>& isk) const {
360
- std::vector<uint32_t> inds = get_indices(isk);
361
- size_t num_sketches = inds.size();
362
- size_t num_splits = split_points.size();
363
-
364
- std::vector<std::vector<T>> pmfs(num_sketches, std::vector<T>(num_splits + 1));
365
- for (uint32_t i = 0; i < num_sketches; ++i) {
366
- auto pmf = sketches_[inds[i]].get_PMF(split_points.data(), num_splits);
367
- for (size_t j = 0; j <= num_splits; ++j) {
368
- pmfs[i][j] = pmf[j];
369
- }
370
- }
371
-
372
- return py::cast(pmfs);
373
- }
374
-
375
- // CDF(s) of sketch(es)
376
- template<typename T, typename C>
377
- py::array vector_of_kll_sketches<T, C>::get_cdf(const py::array_t<T>& split_points,
378
- const py::array_t<int>& isk) const {
379
- std::vector<uint32_t> inds = get_indices(isk);
380
- size_t num_sketches = inds.size();
381
- size_t num_splits = split_points.size();
382
-
383
- std::vector<std::vector<T>> cdfs(num_sketches, std::vector<T>(num_splits + 1));
384
- for (uint32_t i = 0; i < num_sketches; ++i) {
385
- auto cdf = sketches_[inds[i]].get_CDF(split_points.data(), num_splits);
386
- for (size_t j = 0; j <= num_splits; ++j) {
387
- cdfs[i][j] = cdf[j];
388
- }
389
- }
390
-
391
- return py::cast(cdfs);
392
- }
393
-
394
- template<typename T, typename C>
395
- void vector_of_kll_sketches<T, C>::deserialize(const py::bytes& sk_bytes,
396
- uint32_t idx) {
397
- if (idx >= d_) {
398
- throw std::invalid_argument("request for invalid dimenions >= d ("
399
- + std::to_string(d_) +"): "+ std::to_string(idx));
400
- }
401
- std::string skStr = sk_bytes; // implicit cast
402
- // load the sketch into the proper index
403
- sketches_[idx] = std::move(kll_sketch<T>::deserialize(skStr.c_str(), skStr.length()));
404
- }
405
-
406
- template<typename T, typename C>
407
- py::list vector_of_kll_sketches<T, C>::serialize(const py::array_t<int>& isk) {
408
- std::vector<uint32_t> inds = get_indices(isk);
409
- const size_t num_sketches = inds.size();
410
-
411
- py::list list(num_sketches);
412
- for (uint32_t i = 0; i < num_sketches; ++i) {
413
- auto serResult = sketches_[inds[i]].serialize();
414
- list[i] = py::bytes((char*)serResult.data(), serResult.size());
415
- }
416
-
417
- return list;
418
- }
419
-
420
- namespace python {
421
- template<typename T>
422
- double kll_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
423
- return kll_sketch<T>::get_normalized_rank_error(k, pmf);
424
- }
425
-
426
- } // namespace datasketches::python
427
-
428
- } // namespace datasketches
429
-
430
- namespace dspy = datasketches::python;
431
-
432
- template<typename T>
433
- void bind_vector_of_kll_sketches(py::module &m, const char* name) {
434
- using namespace datasketches;
435
-
436
- py::class_<vector_of_kll_sketches<T>>(m, name)
437
- .def(py::init<uint32_t, uint32_t>(), py::arg("k")=vector_of_kll_constants::DEFAULT_K,
438
- py::arg("d")=vector_of_kll_constants::DEFAULT_D)
439
- .def(py::init<const vector_of_kll_sketches<T>&>())
440
- // allow user to retrieve k or d, in case it's instantiated w/ defaults
441
- .def("get_k", &vector_of_kll_sketches<T>::get_k,
442
- "Returns the value of `k` of the sketch(es)")
443
- .def("get_d", &vector_of_kll_sketches<T>::get_d,
444
- "Returns the number of sketches")
445
- .def("update", &vector_of_kll_sketches<T>::update, py::arg("items"),
446
- "Updates the sketch(es) with value(s). Must be a 1D array of size equal to the number of sketches. Can also be 2D array of shape (n_updates, n_sketches). If a sketch does not have a value to update, use np.nan")
447
- .def("__str__", &vector_of_kll_sketches<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
448
- "Produces a string summary of all sketches. Users should split the returned string by '\n\n'")
449
- .def("to_string", &vector_of_kll_sketches<T>::to_string, py::arg("print_levels")=false,
450
- py::arg("print_items")=false,
451
- "Produces a string summary of all sketches. Users should split the returned string by '\n\n'")
452
- .def("is_empty", &vector_of_kll_sketches<T>::is_empty,
453
- "Returns whether the sketch(es) is(are) empty of not")
454
- .def("get_n", &vector_of_kll_sketches<T>::get_n,
455
- "Returns the number of values seen by the sketch(es)")
456
- .def("get_num_retained", &vector_of_kll_sketches<T>::get_num_retained,
457
- "Returns the number of values retained by the sketch(es)")
458
- .def("is_estimation_mode", &vector_of_kll_sketches<T>::is_estimation_mode,
459
- "Returns whether the sketch(es) is(are) in estimation mode")
460
- .def("get_min_values", &vector_of_kll_sketches<T>::get_min_values,
461
- "Returns the minimum value(s) of the sketch(es)")
462
- .def("get_max_values", &vector_of_kll_sketches<T>::get_max_values,
463
- "Returns the maximum value(s) of the sketch(es)")
464
- .def("get_quantiles", &vector_of_kll_sketches<T>::get_quantiles, py::arg("ranks"),
465
- py::arg("isk")=-1,
466
- "Returns the value(s) associated with the specified quantile(s) for the specified sketch(es). `ranks` can be a float between 0 and 1 (inclusive), or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)")
467
- .def("get_ranks", &vector_of_kll_sketches<T>::get_ranks, py::arg("values"),
468
- py::arg("isk")=-1,
469
- "Returns the value(s) associated with the specified ranks(s) for the specified sketch(es). `values` can be an int between 0 and the number of values retained, or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)")
470
- .def("get_pmf", &vector_of_kll_sketches<T>::get_pmf, py::arg("split_points"), py::arg("isk")=-1,
471
- "Returns the probability mass function (PMF) at `split_points` of the specified sketch(es). `split_points` should be a list/array of floats between 0 and 1 (inclusive). `isk` specifies which sketch(es) to return the PMF for (default: all sketches)")
472
- .def("get_cdf", &vector_of_kll_sketches<T>::get_cdf, py::arg("split_points"), py::arg("isk")=-1,
473
- "Returns the cumulative distribution function (CDF) at `split_points` of the specified sketch(es). `split_points` should be a list/array of floats between 0 and 1 (inclusive). `isk` specifies which sketch(es) to return the CDF for (default: all sketches)")
474
- .def_static("get_normalized_rank_error", &dspy::kll_sketch_generic_normalized_rank_error<T>,
475
- py::arg("k"), py::arg("as_pmf"), "Returns the normalized rank error")
476
- .def("serialize", &vector_of_kll_sketches<T>::serialize, py::arg("isk")=-1,
477
- "Serializes the specified sketch(es). `isk` can be an int or a list/array of ints (default: all sketches)")
478
- .def("deserialize", &vector_of_kll_sketches<T>::deserialize, py::arg("skBytes"), py::arg("isk"),
479
- "Deserializes the specified sketch. `isk` must be an int.")
480
- .def("merge", &vector_of_kll_sketches<T>::merge, py::arg("array_of_sketches"),
481
- "Merges the input array of KLL sketches into the existing array.")
482
- .def("collapse", &vector_of_kll_sketches<T>::collapse, py::arg("isk")=-1,
483
- "Returns the result of collapsing all sketches in the array into a single sketch. 'isk' can be an int or a list/array of ints (default: all sketches)")
484
- ;
485
- }
486
-
487
- void init_vector_of_kll(py::module &m) {
488
- bind_vector_of_kll_sketches<int>(m, "vector_of_kll_ints_sketches");
489
- bind_vector_of_kll_sketches<float>(m, "vector_of_kll_floats_sketches");
490
- }
@@ -1,173 +0,0 @@
1
- /*
2
- * Licensed to the Apache Software Foundation (ASF) under one
3
- * or more contributor license agreements. See the NOTICE file
4
- * distributed with this work for additional information
5
- * regarding copyright ownership. The ASF licenses this file
6
- * to you under the Apache License, Version 2.0 (the
7
- * "License"); you may not use this file except in compliance
8
- * with the License. You may obtain a copy of the License at
9
- *
10
- * http://www.apache.org/licenses/LICENSE-2.0
11
- *
12
- * Unless required by applicable law or agreed to in writing,
13
- * software distributed under the License is distributed on an
14
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
- * KIND, either express or implied. See the License for the
16
- * specific language governing permissions and limitations
17
- * under the License.
18
- */
19
-
20
- #include "var_opt_sketch.hpp"
21
- #include "var_opt_union.hpp"
22
- #include "py_serde.hpp"
23
-
24
- #include <pybind11/pybind11.h>
25
-
26
- namespace py = pybind11;
27
-
28
- namespace datasketches {
29
-
30
- namespace python {
31
-
32
- template<typename T>
33
- var_opt_sketch<T> vo_sketch_deserialize(py::bytes& skBytes, py_object_serde& sd) {
34
- std::string skStr = skBytes; // implicit cast
35
- return var_opt_sketch<T>::deserialize(skStr.c_str(), skStr.length(), sd);
36
- }
37
-
38
- template<typename T>
39
- py::object vo_sketch_serialize(const var_opt_sketch<T>& sk, py_object_serde& sd) {
40
- auto serResult = sk.serialize(0, sd);
41
- return py::bytes((char*)serResult.data(), serResult.size());
42
- }
43
-
44
- template<typename T>
45
- size_t vo_sketch_size_bytes(const var_opt_sketch<T>& sk, py_object_serde& sd) {
46
- return sk.get_serialized_size_bytes(sd);
47
- }
48
-
49
- template<typename T>
50
- var_opt_union<T> vo_union_deserialize(py::bytes& uBytes, py_object_serde& sd) {
51
- std::string uStr = uBytes; // implicit cast
52
- return var_opt_union<T>::deserialize(uStr.c_str(), uStr.length(), sd);
53
- }
54
-
55
- template<typename T>
56
- py::object vo_union_serialize(const var_opt_union<T>& u, py_object_serde& sd) {
57
- auto serResult = u.serialize(0, sd);
58
- return py::bytes((char*)serResult.data(), serResult.size());
59
- }
60
-
61
- template<typename T>
62
- size_t vo_union_size_bytes(const var_opt_union<T>& u, py_object_serde& sd) {
63
- return u.get_serialized_size_bytes(sd);
64
- }
65
-
66
- template<typename T>
67
- py::list vo_sketch_get_samples(const var_opt_sketch<T>& sk) {
68
- py::list list;
69
- for (auto item : sk) {
70
- py::tuple t = py::make_tuple(item.first, item.second);
71
- list.append(t);
72
- }
73
- return list;
74
- }
75
-
76
- template<typename T>
77
- py::dict vo_sketch_estimate_subset_sum(const var_opt_sketch<T>& sk, const std::function<bool(T)> func) {
78
- subset_summary summary = sk.estimate_subset_sum(func);
79
- py::dict d;
80
- d["estimate"] = summary.estimate;
81
- d["lower_bound"] = summary.lower_bound;
82
- d["upper_bound"] = summary.upper_bound;
83
- d["total_sketch_weight"] = summary.total_sketch_weight;
84
- return d;
85
- }
86
-
87
- template<typename T>
88
- std::string vo_sketch_to_string(const var_opt_sketch<T>& sk, bool print_items) {
89
- if (print_items) {
90
- std::ostringstream ss;
91
- ss << sk.to_string();
92
- ss << "### VarOpt Sketch Items" << std::endl;
93
- int i = 0;
94
- for (auto item : sk) {
95
- // item.second is always a double
96
- // item.first is an arbitrary py::object, so get the value by
97
- // using internal str() method then casting to C++ std::string
98
- py::str item_pystr(item.first);
99
- std::string item_str = py::cast<std::string>(item_pystr);
100
- ss << i++ << ": " << item_str << "\twt = " << item.second << std::endl;
101
- }
102
- return ss.str();
103
- } else {
104
- return sk.to_string();
105
- }
106
- }
107
-
108
- }
109
- }
110
-
111
- namespace dspy = datasketches::python;
112
-
113
- template<typename T>
114
- void bind_vo_sketch(py::module &m, const char* name) {
115
- using namespace datasketches;
116
-
117
- py::class_<var_opt_sketch<T>>(m, name)
118
- .def(py::init<uint32_t>(), py::arg("k"))
119
- .def("__str__", &dspy::vo_sketch_to_string<T>, py::arg("print_items")=false,
120
- "Produces a string summary of the sketch")
121
- .def("to_string", &dspy::vo_sketch_to_string<T>, py::arg("print_items")=false,
122
- "Produces a string summary of the sketch")
123
- .def("update", (void (var_opt_sketch<T>::*)(const T&, double)) &var_opt_sketch<T>::update, py::arg("item"), py::arg("weight")=1.0,
124
- "Updates the sketch with the given value and weight")
125
- .def_property_readonly("k", &var_opt_sketch<T>::get_k,
126
- "Returns the sketch's maximum configured sample size")
127
- .def_property_readonly("n", &var_opt_sketch<T>::get_n,
128
- "Returns the total stream length")
129
- .def_property_readonly("num_samples", &var_opt_sketch<T>::get_num_samples,
130
- "Returns the number of samples currently in the sketch")
131
- .def("get_samples", &dspy::vo_sketch_get_samples<T>,
132
- "Returns the set of samples in the sketch")
133
- .def("is_empty", &var_opt_sketch<T>::is_empty,
134
- "Returns True if the sketch is empty, otherwise False")
135
- .def("estimate_subset_sum", &dspy::vo_sketch_estimate_subset_sum<T>,
136
- "Applies a provided predicate to the sketch and returns the estimated total weight matching the predicate, as well "
137
- "as upper and lower bounds on the estimate and the total weight processed by the sketch")
138
- .def("get_serialized_size_bytes", &dspy::vo_sketch_size_bytes<T>, py::arg("serde"),
139
- "Computes the size in bytes needed to serialize the current sketch")
140
- .def("serialize", &dspy::vo_sketch_serialize<T>, py::arg("serde"), "Serialize the var opt sketch using the provided serde")
141
- .def_static("deserialize", &dspy::vo_sketch_deserialize<T>, py::arg("bytes"), py::arg("serde"),
142
- "Constructs a var opt sketch from the given bytes using the provided serde")
143
- .def("__iter__", [](const var_opt_sketch<T>& sk) { return py::make_iterator(sk.begin(), sk.end()); });
144
- }
145
-
146
- template<typename T>
147
- void bind_vo_union(py::module &m, const char* name) {
148
- using namespace datasketches;
149
-
150
- py::class_<var_opt_union<T>>(m, name)
151
- .def(py::init<uint32_t>(), py::arg("max_k"))
152
- .def("__str__", &var_opt_union<T>::to_string,
153
- "Produces a string summary of the sketch")
154
- .def("to_string", &var_opt_union<T>::to_string,
155
- "Produces a string summary of the sketch")
156
- .def("update", (void (var_opt_union<T>::*)(const var_opt_sketch<T>& sk)) &var_opt_union<T>::update, py::arg("sketch"),
157
- "Updates the union with the given sketch")
158
- .def("get_result", &var_opt_union<T>::get_result,
159
- "Returns a sketch corresponding to the union result")
160
- .def("reset", &var_opt_union<T>::reset,
161
- "Resets the union to the empty state")
162
- .def("get_serialized_size_bytes", &dspy::vo_union_size_bytes<T>, py::arg("serde"),
163
- "Computes the size in bytes needed to serialize the current sketch")
164
- .def("serialize", &dspy::vo_union_serialize<T>, py::arg("serde"), "Serialize the var opt union using the provided serde")
165
- .def_static("deserialize", &dspy::vo_union_deserialize<T>, py::arg("bytes"), py::arg("serde"),
166
- "Constructs a var opt union from the given bytes using the provided serde")
167
- ;
168
- }
169
-
170
- void init_vo(py::module &m) {
171
- bind_vo_sketch<py::object>(m, "var_opt_sketch");
172
- bind_vo_union<py::object>(m, "var_opt_union");
173
- }
@@ -1,16 +0,0 @@
1
- # Licensed to the Apache Software Foundation (ASF) under one
2
- # or more contributor license agreements. See the NOTICE file
3
- # distributed with this work for additional information
4
- # regarding copyright ownership. The ASF licenses this file
5
- # to you under the Apache License, Version 2.0 (the
6
- # "License"); you may not use this file except in compliance
7
- # with the License. You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing,
12
- # software distributed under the License is distributed on an
13
- # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
- # KIND, either express or implied. See the License for the
15
- # specific language governing permissions and limitations
16
- # under the License.