datasketches 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (245) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/NOTICE +1 -1
  4. data/README.md +0 -2
  5. data/ext/datasketches/cpc_wrapper.cpp +3 -3
  6. data/ext/datasketches/kll_wrapper.cpp +0 -10
  7. data/lib/datasketches/version.rb +1 -1
  8. data/lib/datasketches.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +23 -20
  10. data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
  11. data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
  12. data/vendor/datasketches-cpp/Doxyfile +2827 -0
  13. data/vendor/datasketches-cpp/LICENSE +0 -76
  14. data/vendor/datasketches-cpp/README.md +1 -3
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +15 -10
  18. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -2
  19. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
  20. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  21. data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
  22. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
  23. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
  24. data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
  25. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +2 -2
  26. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  27. data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
  28. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
  29. data/vendor/datasketches-cpp/{python/src/__init__.py → count/CMakeLists.txt} +25 -1
  30. data/vendor/datasketches-cpp/count/include/count_min.hpp +405 -0
  31. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +497 -0
  32. data/vendor/datasketches-cpp/{MANIFEST.in → count/test/CMakeLists.txt} +23 -20
  33. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  34. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +303 -0
  35. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
  36. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
  37. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
  38. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  39. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
  40. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +14 -11
  41. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
  42. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
  43. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  44. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
  45. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
  46. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
  47. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
  48. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_serialize_for_java.cpp +38 -0
  49. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
  50. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
  51. data/vendor/datasketches-cpp/{tox.ini → density/CMakeLists.txt} +24 -8
  52. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +256 -0
  53. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  54. data/vendor/datasketches-cpp/{python/datasketches/__init__.py → density/test/CMakeLists.txt} +15 -3
  55. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  59. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
  60. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
  63. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
  64. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
  65. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  66. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  67. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  68. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  69. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  70. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  71. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +94 -61
  72. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +20 -8
  73. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
  74. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
  75. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  76. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  77. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -18
  78. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  79. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  80. data/vendor/datasketches-cpp/hll/include/hll.hpp +79 -65
  81. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
  82. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  83. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
  84. data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
  85. data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
  86. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  87. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
  88. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +79 -53
  89. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +61 -132
  90. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
  91. data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
  92. data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
  93. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +5 -40
  94. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +76 -54
  95. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +66 -136
  96. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
  97. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
  98. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
  99. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +15 -39
  100. data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
  101. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -4
  102. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +105 -26
  103. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +50 -111
  104. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
  105. data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
  106. data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
  107. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
  108. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
  109. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
  110. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
  111. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
  112. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
  113. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +89 -32
  114. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +33 -19
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +13 -10
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +23 -19
  117. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
  118. data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
  119. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
  120. data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
  121. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
  122. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
  123. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -51
  124. data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
  125. data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
  126. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -20
  127. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  128. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  129. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  130. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
  131. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  132. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  133. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
  134. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
  135. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
  137. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
  139. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
  140. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
  141. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
  142. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +180 -33
  143. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  144. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
  145. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +10 -10
  146. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
  147. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +21 -6
  148. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +13 -3
  149. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +15 -1
  150. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  151. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
  152. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
  153. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +39 -188
  154. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  155. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
  157. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
  158. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
  159. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
  160. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection_impl.hpp +31 -0
  161. data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
  162. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
  163. data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
  164. data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
  165. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
  166. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
  167. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
  168. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
  169. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  170. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
  171. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
  172. data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
  173. data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
  174. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
  175. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
  176. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
  177. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
  178. data/vendor/datasketches-cpp/{python/src/datasketches.cpp → tuple/test/tuple_sketch_serialize_for_java.cpp} +16 -30
  179. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
  180. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  181. metadata +61 -79
  182. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  183. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  184. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  185. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  188. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  189. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  190. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  191. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  192. data/vendor/datasketches-cpp/pyproject.toml +0 -23
  193. data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -81
  194. data/vendor/datasketches-cpp/python/README.md +0 -85
  195. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -104
  196. data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
  197. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
  198. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
  199. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
  200. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
  201. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
  202. data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
  203. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -90
  204. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -128
  205. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -134
  206. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -210
  207. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
  208. data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -111
  209. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -204
  210. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -215
  211. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -172
  212. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
  213. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
  214. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
  215. data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
  216. data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -110
  217. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -130
  218. data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -125
  219. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -126
  220. data/vendor/datasketches-cpp/python/tests/req_test.py +0 -126
  221. data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -146
  222. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
  223. data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -125
  224. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  225. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  226. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  227. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  228. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
  230. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  231. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  232. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  233. data/vendor/datasketches-cpp/setup.py +0 -110
  234. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  238. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
  239. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
  240. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
  241. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
  242. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  243. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  244. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  245. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -1,490 +0,0 @@
1
- /*
2
- * Licensed to the Apache Software Foundation (ASF) under one
3
- * or more contributor license agreements. See the NOTICE file
4
- * distributed with this work for additional information
5
- * regarding copyright ownership. The ASF licenses this file
6
- * to you under the Apache License, Version 2.0 (the
7
- * "License"); you may not use this file except in compliance
8
- * with the License. You may obtain a copy of the License at
9
- *
10
- * http://www.apache.org/licenses/LICENSE-2.0
11
- *
12
- * Unless required by applicable law or agreed to in writing,
13
- * software distributed under the License is distributed on an
14
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
- * KIND, either express or implied. See the License for the
16
- * specific language governing permissions and limitations
17
- * under the License.
18
- */
19
-
20
- #include "kll_sketch.hpp"
21
-
22
- #include <pybind11/pybind11.h>
23
- #include <pybind11/stl.h>
24
- #include <pybind11/numpy.h>
25
- #include <sstream>
26
- #include <vector>
27
- #include <stdexcept>
28
-
29
- namespace py = pybind11;
30
-
31
- namespace datasketches {
32
-
33
- namespace vector_of_kll_constants {
34
- static const uint32_t DEFAULT_K = kll_constants::DEFAULT_K;
35
- static const uint32_t DEFAULT_D = 1;
36
- }
37
-
38
- // Wrapper class for Numpy compatibility
39
- template <typename T, typename C = std::less<T>>
40
- class vector_of_kll_sketches {
41
- public:
42
- explicit vector_of_kll_sketches(uint32_t k = vector_of_kll_constants::DEFAULT_K, uint32_t d = vector_of_kll_constants::DEFAULT_D);
43
- vector_of_kll_sketches(const vector_of_kll_sketches& other);
44
- vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept;
45
- vector_of_kll_sketches<T, C>& operator=(const vector_of_kll_sketches& other);
46
- vector_of_kll_sketches<T, C>& operator=(vector_of_kll_sketches&& other);
47
-
48
- // container parameters
49
- inline uint32_t get_k() const;
50
- inline uint32_t get_d() const;
51
-
52
- // sketch updates/merges
53
- void update(const py::array_t<T>& items);
54
- void merge(const vector_of_kll_sketches<T>& other);
55
-
56
- // returns a single sketch combining all data in the array
57
- kll_sketch<T, C> collapse(const py::array_t<int>& isk) const;
58
-
59
- // sketch queries returning an array of results
60
- py::array is_empty() const;
61
- py::array get_n() const;
62
- py::array is_estimation_mode() const;
63
- py::array get_min_values() const;
64
- py::array get_max_values() const;
65
- py::array get_num_retained() const;
66
- py::array get_quantiles(const py::array_t<double>& ranks, const py::array_t<int>& isk) const;
67
- py::array get_ranks(const py::array_t<T>& values, const py::array_t<int>& isk) const;
68
- py::array get_pmf(const py::array_t<T>& split_points, const py::array_t<int>& isk) const;
69
- py::array get_cdf(const py::array_t<T>& split_points, const py::array_t<int>& isk) const;
70
-
71
- // human-readable output
72
- std::string to_string(bool print_levels = false, bool print_items = false) const;
73
-
74
- // binary output/input
75
- py::list serialize(const py::array_t<int>& isk);
76
- // note: deserialize() replaces the sketch at the specified
77
- // index. Not a static method.
78
- void deserialize(const py::bytes& sk_bytes, uint32_t idx);
79
-
80
- private:
81
- std::vector<uint32_t> get_indices(const py::array_t<int>& isk) const;
82
-
83
- const uint32_t k_; // kll sketch k parameter
84
- const uint32_t d_; // number of dimensions (here: sketches) to hold
85
- std::vector<kll_sketch<T, C>> sketches_;
86
- };
87
-
88
- template<typename T, typename C>
89
- vector_of_kll_sketches<T, C>::vector_of_kll_sketches(uint32_t k, uint32_t d):
90
- k_(k),
91
- d_(d)
92
- {
93
- // check d is valid (k is checked by kll_sketch)
94
- if (d < 1) {
95
- throw std::invalid_argument("D must be >= 1: " + std::to_string(d));
96
- }
97
-
98
- sketches_.reserve(d);
99
- // spawn the sketches
100
- for (uint32_t i = 0; i < d; i++) {
101
- sketches_.emplace_back(k);
102
- }
103
- }
104
-
105
- template<typename T, typename C>
106
- vector_of_kll_sketches<T, C>::vector_of_kll_sketches(const vector_of_kll_sketches& other) :
107
- k_(other.k_),
108
- d_(other.d_),
109
- sketches_(other.sketches_)
110
- {}
111
-
112
- template<typename T, typename C>
113
- vector_of_kll_sketches<T, C>::vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept :
114
- k_(other.k_),
115
- d_(other.d_),
116
- sketches_(std::move(other.sketches_))
117
- {}
118
-
119
- template<typename T, typename C>
120
- vector_of_kll_sketches<T, C>& vector_of_kll_sketches<T, C>::operator=(const vector_of_kll_sketches& other) {
121
- vector_of_kll_sketches<T, C> copy(other);
122
- k_ = copy.k_;
123
- d_ = copy.d_;
124
- std::swap(sketches_, copy.sketches_);
125
- return *this;
126
- }
127
-
128
- template<typename T, typename C>
129
- vector_of_kll_sketches<T, C>& vector_of_kll_sketches<T, C>::operator=(vector_of_kll_sketches&& other) {
130
- k_ = other.k_;
131
- d_ = other.d_;
132
- std::swap(sketches_, other.sketches_);
133
- return *this;
134
- }
135
-
136
- template<typename T, typename C>
137
- uint32_t vector_of_kll_sketches<T, C>::get_k() const {
138
- return k_;
139
- }
140
-
141
- template<typename T, typename C>
142
- uint32_t vector_of_kll_sketches<T, C>::get_d() const {
143
- return d_;
144
- }
145
-
146
- template<typename T, typename C>
147
- std::vector<uint32_t> vector_of_kll_sketches<T, C>::get_indices(const py::array_t<int>& isk) const {
148
- std::vector<uint32_t> indices;
149
- if (isk.size() == 1) {
150
- auto data = isk.unchecked();
151
- if (data(0) == -1) {
152
- indices.reserve(d_);
153
- for (uint32_t i = 0; i < d_; ++i) {
154
- indices.push_back(i);
155
- }
156
- } else {
157
- indices.push_back(static_cast<uint32_t>(data(0)));
158
- }
159
- } else {
160
- auto data = isk.unchecked<1>();
161
- indices.reserve(isk.size());
162
- for (uint32_t i = 0; i < isk.size(); ++i) {
163
- const uint32_t idx = static_cast<uint32_t>(data(i));
164
- if (idx < d_) {
165
- indices.push_back(idx);
166
- } else {
167
- throw std::invalid_argument("request for invalid dimenions >= d ("
168
- + std::to_string(d_) +"): "+ std::to_string(idx));
169
- }
170
- }
171
- }
172
- return indices;
173
- }
174
-
175
- // Checks if each sketch is empty or not
176
- template<typename T, typename C>
177
- py::array vector_of_kll_sketches<T, C>::is_empty() const {
178
- std::vector<bool> vals(d_);
179
- for (uint32_t i = 0; i < d_; ++i) {
180
- vals[i] = sketches_[i].is_empty();
181
- }
182
-
183
- return py::cast(vals);
184
- }
185
-
186
- // Updates each sketch with values
187
- // Currently: all values must be present
188
- // TODO: allow subsets of sketches to be updated
189
- template<typename T, typename C>
190
- void vector_of_kll_sketches<T, C>::update(const py::array_t<T>& items) {
191
-
192
- size_t ndim = items.ndim();
193
-
194
- if (items.shape(ndim-1) != d_) {
195
- throw std::invalid_argument("input data must have rows with " + std::to_string(d_)
196
- + " elements. Found: " + std::to_string(items.shape(ndim-1)));
197
- }
198
-
199
- if (ndim == 1) {
200
- // 1D case: single value to update per sketch
201
- auto data = items.template unchecked<1>();
202
- for (uint32_t i = 0; i < d_; ++i) {
203
- sketches_[i].update(data(i));
204
- }
205
- }
206
- else if (ndim == 2) {
207
- // 2D case: multiple values to update per sketch
208
- auto data = items.template unchecked<2>();
209
- if (items.flags() & py::array::f_style) {
210
- for (uint32_t j = 0; j < d_; ++j) {
211
- for (uint32_t i = 0; i < items.shape(0); ++i) {
212
- sketches_[j].update(data(i,j));
213
- }
214
- }
215
- } else { // py::array::c_style or py::array::forcecast
216
- for (uint32_t i = 0; i < items.shape(0); ++i) {
217
- for (uint32_t j = 0; j < d_; ++j) {
218
- sketches_[j].update(data(i,j));
219
- }
220
- }
221
- }
222
- }
223
- else {
224
- throw std::invalid_argument("Update input must be 2 or fewer dimensions : " + std::to_string(ndim));
225
- }
226
- }
227
-
228
- // Merges two arrays of sketches
229
- // Currently: all values must be present
230
- template<typename T, typename C>
231
- void vector_of_kll_sketches<T, C>::merge(const vector_of_kll_sketches<T>& other) {
232
- if (d_ != other.get_d()) {
233
- throw std::invalid_argument("Must have same number of dimensions to merge: " + std::to_string(d_)
234
- + " vs " + std::to_string(other.d_));
235
- } else {
236
- for (uint32_t i = 0; i < d_; ++i) {
237
- sketches_[i].merge(other.sketches_[i]);
238
- }
239
- }
240
- }
241
-
242
- template<typename T, typename C>
243
- kll_sketch<T, C> vector_of_kll_sketches<T, C>::collapse(const py::array_t<int>& isk) const {
244
- std::vector<uint32_t> inds = get_indices(isk);
245
-
246
- kll_sketch<T, C> result(k_);
247
- for (auto& idx : inds) {
248
- result.merge(sketches_[idx]);
249
- }
250
- return result;
251
- }
252
-
253
- // Number of updates for each sketch
254
- template<typename T, typename C>
255
- py::array vector_of_kll_sketches<T, C>::get_n() const {
256
- std::vector<uint64_t> vals(d_);
257
- for (uint32_t i = 0; i < d_; ++i) {
258
- vals[i] = sketches_[i].get_n();
259
- }
260
- return py::cast(vals);
261
- }
262
-
263
- // Number of retained values for each sketch
264
- template<typename T, typename C>
265
- py::array vector_of_kll_sketches<T, C>::get_num_retained() const {
266
- std::vector<uint32_t> vals(d_);
267
- for (uint32_t i = 0; i < d_; ++i) {
268
- vals[i] = sketches_[i].get_num_retained();
269
- }
270
- return py::cast(vals);
271
- }
272
-
273
- // Gets the minimum value of each sketch
274
- // TODO: allow subsets of sketches
275
- template<typename T, typename C>
276
- py::array vector_of_kll_sketches<T, C>::get_min_values() const {
277
- std::vector<T> vals(d_);
278
- for (uint32_t i = 0; i < d_; ++i) {
279
- vals[i] = sketches_[i].get_min_item();
280
- }
281
- return py::cast(vals);
282
- }
283
-
284
- // Gets the maximum value of each sketch
285
- // TODO: allow subsets of sketches
286
- template<typename T, typename C>
287
- py::array vector_of_kll_sketches<T, C>::get_max_values() const {
288
- std::vector<T> vals(d_);
289
- for (uint32_t i = 0; i < d_; ++i) {
290
- vals[i] = sketches_[i].get_max_item();
291
- }
292
- return py::cast(vals);
293
- }
294
-
295
- // Summary of each sketch as one long string
296
- // Users should use .split('\n\n') when calling it to build a list of each
297
- // sketch's summary
298
- template<typename T, typename C>
299
- std::string vector_of_kll_sketches<T, C>::to_string(bool print_levels, bool print_items) const {
300
- std::ostringstream ss;
301
- for (uint32_t i = 0; i < d_; ++i) {
302
- // all streams into 1 string, for compatibility with Python's str() behavior
303
- // users will need to split by \n\n, e.g., str(kll).split('\n\n')
304
- if (i > 0) ss << "\n";
305
- ss << sketches_[i].to_string(print_levels, print_items);
306
- }
307
- return ss.str();
308
- }
309
-
310
- template<typename T, typename C>
311
- py::array vector_of_kll_sketches<T, C>::is_estimation_mode() const {
312
- std::vector<bool> vals(d_);
313
- for (uint32_t i = 0; i < d_; ++i) {
314
- vals[i] = sketches_[i].is_estimation_mode();
315
- }
316
- return py::cast(vals);
317
- }
318
-
319
- // Value of sketch(es) corresponding to some quantile(s)
320
- template<typename T, typename C>
321
- py::array vector_of_kll_sketches<T, C>::get_quantiles(const py::array_t<double>& ranks,
322
- const py::array_t<int>& isk) const {
323
- std::vector<uint32_t> inds = get_indices(isk);
324
- size_t num_sketches = inds.size();
325
- size_t num_quantiles = ranks.size();
326
-
327
- std::vector<std::vector<T>> quants(num_sketches, std::vector<T>(num_quantiles));
328
- for (uint32_t i = 0; i < num_sketches; ++i) {
329
- for (size_t j = 0; j < num_quantiles; ++j) {
330
- quants[i][j] = sketches_[inds[i]].get_quantile(ranks.data()[j]);
331
- }
332
- }
333
-
334
- return py::cast(quants);
335
- }
336
-
337
- // Value of sketch(es) corresponding to some rank(s)
338
- template<typename T, typename C>
339
- py::array vector_of_kll_sketches<T, C>::get_ranks(const py::array_t<T>& values,
340
- const py::array_t<int>& isk) const {
341
- std::vector<uint32_t> inds = get_indices(isk);
342
- size_t num_sketches = inds.size();
343
- size_t num_ranks = values.size();
344
- auto vals = values.data();
345
-
346
- std::vector<std::vector<float>> ranks(num_sketches, std::vector<float>(num_ranks));
347
- for (uint32_t i = 0; i < num_sketches; ++i) {
348
- for (size_t j = 0; j < num_ranks; ++j) {
349
- ranks[i][j] = sketches_[inds[i]].get_rank(vals[j]);
350
- }
351
- }
352
-
353
- return py::cast(ranks);
354
- }
355
-
356
- // PMF(s) of sketch(es)
357
- template<typename T, typename C>
358
- py::array vector_of_kll_sketches<T, C>::get_pmf(const py::array_t<T>& split_points,
359
- const py::array_t<int>& isk) const {
360
- std::vector<uint32_t> inds = get_indices(isk);
361
- size_t num_sketches = inds.size();
362
- size_t num_splits = split_points.size();
363
-
364
- std::vector<std::vector<T>> pmfs(num_sketches, std::vector<T>(num_splits + 1));
365
- for (uint32_t i = 0; i < num_sketches; ++i) {
366
- auto pmf = sketches_[inds[i]].get_PMF(split_points.data(), num_splits);
367
- for (size_t j = 0; j <= num_splits; ++j) {
368
- pmfs[i][j] = pmf[j];
369
- }
370
- }
371
-
372
- return py::cast(pmfs);
373
- }
374
-
375
- // CDF(s) of sketch(es)
376
- template<typename T, typename C>
377
- py::array vector_of_kll_sketches<T, C>::get_cdf(const py::array_t<T>& split_points,
378
- const py::array_t<int>& isk) const {
379
- std::vector<uint32_t> inds = get_indices(isk);
380
- size_t num_sketches = inds.size();
381
- size_t num_splits = split_points.size();
382
-
383
- std::vector<std::vector<T>> cdfs(num_sketches, std::vector<T>(num_splits + 1));
384
- for (uint32_t i = 0; i < num_sketches; ++i) {
385
- auto cdf = sketches_[inds[i]].get_CDF(split_points.data(), num_splits);
386
- for (size_t j = 0; j <= num_splits; ++j) {
387
- cdfs[i][j] = cdf[j];
388
- }
389
- }
390
-
391
- return py::cast(cdfs);
392
- }
393
-
394
- template<typename T, typename C>
395
- void vector_of_kll_sketches<T, C>::deserialize(const py::bytes& sk_bytes,
396
- uint32_t idx) {
397
- if (idx >= d_) {
398
- throw std::invalid_argument("request for invalid dimenions >= d ("
399
- + std::to_string(d_) +"): "+ std::to_string(idx));
400
- }
401
- std::string skStr = sk_bytes; // implicit cast
402
- // load the sketch into the proper index
403
- sketches_[idx] = std::move(kll_sketch<T>::deserialize(skStr.c_str(), skStr.length()));
404
- }
405
-
406
- template<typename T, typename C>
407
- py::list vector_of_kll_sketches<T, C>::serialize(const py::array_t<int>& isk) {
408
- std::vector<uint32_t> inds = get_indices(isk);
409
- const size_t num_sketches = inds.size();
410
-
411
- py::list list(num_sketches);
412
- for (uint32_t i = 0; i < num_sketches; ++i) {
413
- auto serResult = sketches_[inds[i]].serialize();
414
- list[i] = py::bytes((char*)serResult.data(), serResult.size());
415
- }
416
-
417
- return list;
418
- }
419
-
420
- namespace python {
421
- template<typename T>
422
- double kll_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
423
- return kll_sketch<T>::get_normalized_rank_error(k, pmf);
424
- }
425
-
426
- } // namespace datasketches::python
427
-
428
- } // namespace datasketches
429
-
430
- namespace dspy = datasketches::python;
431
-
432
- template<typename T>
433
- void bind_vector_of_kll_sketches(py::module &m, const char* name) {
434
- using namespace datasketches;
435
-
436
- py::class_<vector_of_kll_sketches<T>>(m, name)
437
- .def(py::init<uint32_t, uint32_t>(), py::arg("k")=vector_of_kll_constants::DEFAULT_K,
438
- py::arg("d")=vector_of_kll_constants::DEFAULT_D)
439
- .def(py::init<const vector_of_kll_sketches<T>&>())
440
- // allow user to retrieve k or d, in case it's instantiated w/ defaults
441
- .def("get_k", &vector_of_kll_sketches<T>::get_k,
442
- "Returns the value of `k` of the sketch(es)")
443
- .def("get_d", &vector_of_kll_sketches<T>::get_d,
444
- "Returns the number of sketches")
445
- .def("update", &vector_of_kll_sketches<T>::update, py::arg("items"),
446
- "Updates the sketch(es) with value(s). Must be a 1D array of size equal to the number of sketches. Can also be 2D array of shape (n_updates, n_sketches). If a sketch does not have a value to update, use np.nan")
447
- .def("__str__", &vector_of_kll_sketches<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
448
- "Produces a string summary of all sketches. Users should split the returned string by '\n\n'")
449
- .def("to_string", &vector_of_kll_sketches<T>::to_string, py::arg("print_levels")=false,
450
- py::arg("print_items")=false,
451
- "Produces a string summary of all sketches. Users should split the returned string by '\n\n'")
452
- .def("is_empty", &vector_of_kll_sketches<T>::is_empty,
453
- "Returns whether the sketch(es) is(are) empty of not")
454
- .def("get_n", &vector_of_kll_sketches<T>::get_n,
455
- "Returns the number of values seen by the sketch(es)")
456
- .def("get_num_retained", &vector_of_kll_sketches<T>::get_num_retained,
457
- "Returns the number of values retained by the sketch(es)")
458
- .def("is_estimation_mode", &vector_of_kll_sketches<T>::is_estimation_mode,
459
- "Returns whether the sketch(es) is(are) in estimation mode")
460
- .def("get_min_values", &vector_of_kll_sketches<T>::get_min_values,
461
- "Returns the minimum value(s) of the sketch(es)")
462
- .def("get_max_values", &vector_of_kll_sketches<T>::get_max_values,
463
- "Returns the maximum value(s) of the sketch(es)")
464
- .def("get_quantiles", &vector_of_kll_sketches<T>::get_quantiles, py::arg("ranks"),
465
- py::arg("isk")=-1,
466
- "Returns the value(s) associated with the specified quantile(s) for the specified sketch(es). `ranks` can be a float between 0 and 1 (inclusive), or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)")
467
- .def("get_ranks", &vector_of_kll_sketches<T>::get_ranks, py::arg("values"),
468
- py::arg("isk")=-1,
469
- "Returns the value(s) associated with the specified ranks(s) for the specified sketch(es). `values` can be an int between 0 and the number of values retained, or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)")
470
- .def("get_pmf", &vector_of_kll_sketches<T>::get_pmf, py::arg("split_points"), py::arg("isk")=-1,
471
- "Returns the probability mass function (PMF) at `split_points` of the specified sketch(es). `split_points` should be a list/array of floats between 0 and 1 (inclusive). `isk` specifies which sketch(es) to return the PMF for (default: all sketches)")
472
- .def("get_cdf", &vector_of_kll_sketches<T>::get_cdf, py::arg("split_points"), py::arg("isk")=-1,
473
- "Returns the cumulative distribution function (CDF) at `split_points` of the specified sketch(es). `split_points` should be a list/array of floats between 0 and 1 (inclusive). `isk` specifies which sketch(es) to return the CDF for (default: all sketches)")
474
- .def_static("get_normalized_rank_error", &dspy::kll_sketch_generic_normalized_rank_error<T>,
475
- py::arg("k"), py::arg("as_pmf"), "Returns the normalized rank error")
476
- .def("serialize", &vector_of_kll_sketches<T>::serialize, py::arg("isk")=-1,
477
- "Serializes the specified sketch(es). `isk` can be an int or a list/array of ints (default: all sketches)")
478
- .def("deserialize", &vector_of_kll_sketches<T>::deserialize, py::arg("skBytes"), py::arg("isk"),
479
- "Deserializes the specified sketch. `isk` must be an int.")
480
- .def("merge", &vector_of_kll_sketches<T>::merge, py::arg("array_of_sketches"),
481
- "Merges the input array of KLL sketches into the existing array.")
482
- .def("collapse", &vector_of_kll_sketches<T>::collapse, py::arg("isk")=-1,
483
- "Returns the result of collapsing all sketches in the array into a single sketch. 'isk' can be an int or a list/array of ints (default: all sketches)")
484
- ;
485
- }
486
-
487
- void init_vector_of_kll(py::module &m) {
488
- bind_vector_of_kll_sketches<int>(m, "vector_of_kll_ints_sketches");
489
- bind_vector_of_kll_sketches<float>(m, "vector_of_kll_floats_sketches");
490
- }
@@ -1,173 +0,0 @@
1
- /*
2
- * Licensed to the Apache Software Foundation (ASF) under one
3
- * or more contributor license agreements. See the NOTICE file
4
- * distributed with this work for additional information
5
- * regarding copyright ownership. The ASF licenses this file
6
- * to you under the Apache License, Version 2.0 (the
7
- * "License"); you may not use this file except in compliance
8
- * with the License. You may obtain a copy of the License at
9
- *
10
- * http://www.apache.org/licenses/LICENSE-2.0
11
- *
12
- * Unless required by applicable law or agreed to in writing,
13
- * software distributed under the License is distributed on an
14
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
- * KIND, either express or implied. See the License for the
16
- * specific language governing permissions and limitations
17
- * under the License.
18
- */
19
-
20
- #include "var_opt_sketch.hpp"
21
- #include "var_opt_union.hpp"
22
- #include "py_serde.hpp"
23
-
24
- #include <pybind11/pybind11.h>
25
-
26
- namespace py = pybind11;
27
-
28
- namespace datasketches {
29
-
30
- namespace python {
31
-
32
- template<typename T>
33
- var_opt_sketch<T> vo_sketch_deserialize(py::bytes& skBytes, py_object_serde& sd) {
34
- std::string skStr = skBytes; // implicit cast
35
- return var_opt_sketch<T>::deserialize(skStr.c_str(), skStr.length(), sd);
36
- }
37
-
38
- template<typename T>
39
- py::object vo_sketch_serialize(const var_opt_sketch<T>& sk, py_object_serde& sd) {
40
- auto serResult = sk.serialize(0, sd);
41
- return py::bytes((char*)serResult.data(), serResult.size());
42
- }
43
-
44
- template<typename T>
45
- size_t vo_sketch_size_bytes(const var_opt_sketch<T>& sk, py_object_serde& sd) {
46
- return sk.get_serialized_size_bytes(sd);
47
- }
48
-
49
- template<typename T>
50
- var_opt_union<T> vo_union_deserialize(py::bytes& uBytes, py_object_serde& sd) {
51
- std::string uStr = uBytes; // implicit cast
52
- return var_opt_union<T>::deserialize(uStr.c_str(), uStr.length(), sd);
53
- }
54
-
55
- template<typename T>
56
- py::object vo_union_serialize(const var_opt_union<T>& u, py_object_serde& sd) {
57
- auto serResult = u.serialize(0, sd);
58
- return py::bytes((char*)serResult.data(), serResult.size());
59
- }
60
-
61
- template<typename T>
62
- size_t vo_union_size_bytes(const var_opt_union<T>& u, py_object_serde& sd) {
63
- return u.get_serialized_size_bytes(sd);
64
- }
65
-
66
- template<typename T>
67
- py::list vo_sketch_get_samples(const var_opt_sketch<T>& sk) {
68
- py::list list;
69
- for (auto item : sk) {
70
- py::tuple t = py::make_tuple(item.first, item.second);
71
- list.append(t);
72
- }
73
- return list;
74
- }
75
-
76
- template<typename T>
77
- py::dict vo_sketch_estimate_subset_sum(const var_opt_sketch<T>& sk, const std::function<bool(T)> func) {
78
- subset_summary summary = sk.estimate_subset_sum(func);
79
- py::dict d;
80
- d["estimate"] = summary.estimate;
81
- d["lower_bound"] = summary.lower_bound;
82
- d["upper_bound"] = summary.upper_bound;
83
- d["total_sketch_weight"] = summary.total_sketch_weight;
84
- return d;
85
- }
86
-
87
- template<typename T>
88
- std::string vo_sketch_to_string(const var_opt_sketch<T>& sk, bool print_items) {
89
- if (print_items) {
90
- std::ostringstream ss;
91
- ss << sk.to_string();
92
- ss << "### VarOpt Sketch Items" << std::endl;
93
- int i = 0;
94
- for (auto item : sk) {
95
- // item.second is always a double
96
- // item.first is an arbitrary py::object, so get the value by
97
- // using internal str() method then casting to C++ std::string
98
- py::str item_pystr(item.first);
99
- std::string item_str = py::cast<std::string>(item_pystr);
100
- ss << i++ << ": " << item_str << "\twt = " << item.second << std::endl;
101
- }
102
- return ss.str();
103
- } else {
104
- return sk.to_string();
105
- }
106
- }
107
-
108
- }
109
- }
110
-
111
- namespace dspy = datasketches::python;
112
-
113
- template<typename T>
114
- void bind_vo_sketch(py::module &m, const char* name) {
115
- using namespace datasketches;
116
-
117
- py::class_<var_opt_sketch<T>>(m, name)
118
- .def(py::init<uint32_t>(), py::arg("k"))
119
- .def("__str__", &dspy::vo_sketch_to_string<T>, py::arg("print_items")=false,
120
- "Produces a string summary of the sketch")
121
- .def("to_string", &dspy::vo_sketch_to_string<T>, py::arg("print_items")=false,
122
- "Produces a string summary of the sketch")
123
- .def("update", (void (var_opt_sketch<T>::*)(const T&, double)) &var_opt_sketch<T>::update, py::arg("item"), py::arg("weight")=1.0,
124
- "Updates the sketch with the given value and weight")
125
- .def_property_readonly("k", &var_opt_sketch<T>::get_k,
126
- "Returns the sketch's maximum configured sample size")
127
- .def_property_readonly("n", &var_opt_sketch<T>::get_n,
128
- "Returns the total stream length")
129
- .def_property_readonly("num_samples", &var_opt_sketch<T>::get_num_samples,
130
- "Returns the number of samples currently in the sketch")
131
- .def("get_samples", &dspy::vo_sketch_get_samples<T>,
132
- "Returns the set of samples in the sketch")
133
- .def("is_empty", &var_opt_sketch<T>::is_empty,
134
- "Returns True if the sketch is empty, otherwise False")
135
- .def("estimate_subset_sum", &dspy::vo_sketch_estimate_subset_sum<T>,
136
- "Applies a provided predicate to the sketch and returns the estimated total weight matching the predicate, as well "
137
- "as upper and lower bounds on the estimate and the total weight processed by the sketch")
138
- .def("get_serialized_size_bytes", &dspy::vo_sketch_size_bytes<T>, py::arg("serde"),
139
- "Computes the size in bytes needed to serialize the current sketch")
140
- .def("serialize", &dspy::vo_sketch_serialize<T>, py::arg("serde"), "Serialize the var opt sketch using the provided serde")
141
- .def_static("deserialize", &dspy::vo_sketch_deserialize<T>, py::arg("bytes"), py::arg("serde"),
142
- "Constructs a var opt sketch from the given bytes using the provided serde")
143
- ;
144
- }
145
-
146
- template<typename T>
147
- void bind_vo_union(py::module &m, const char* name) {
148
- using namespace datasketches;
149
-
150
- py::class_<var_opt_union<T>>(m, name)
151
- .def(py::init<uint32_t>(), py::arg("max_k"))
152
- .def("__str__", &var_opt_union<T>::to_string,
153
- "Produces a string summary of the sketch")
154
- .def("to_string", &var_opt_union<T>::to_string,
155
- "Produces a string summary of the sketch")
156
- .def("update", (void (var_opt_union<T>::*)(const var_opt_sketch<T>& sk)) &var_opt_union<T>::update, py::arg("sketch"),
157
- "Updates the union with the given sketch")
158
- .def("get_result", &var_opt_union<T>::get_result,
159
- "Returns a sketch corresponding to the union result")
160
- .def("reset", &var_opt_union<T>::reset,
161
- "Resets the union to the empty state")
162
- .def("get_serialized_size_bytes", &dspy::vo_union_size_bytes<T>, py::arg("serde"),
163
- "Computes the size in bytes needed to serialize the current sketch")
164
- .def("serialize", &dspy::vo_union_serialize<T>, py::arg("serde"), "Serialize the var opt union using the provided serde")
165
- .def_static("deserialize", &dspy::vo_union_deserialize<T>, py::arg("bytes"), py::arg("serde"),
166
- "Constructs a var opt union from the given bytes using the provided serde")
167
- ;
168
- }
169
-
170
- void init_vo(py::module &m) {
171
- bind_vo_sketch<py::object>(m, "var_opt_sketch");
172
- bind_vo_union<py::object>(m, "var_opt_union");
173
- }
@@ -1,16 +0,0 @@
1
- # Licensed to the Apache Software Foundation (ASF) under one
2
- # or more contributor license agreements. See the NOTICE file
3
- # distributed with this work for additional information
4
- # regarding copyright ownership. The ASF licenses this file
5
- # to you under the Apache License, Version 2.0 (the
6
- # "License"); you may not use this file except in compliance
7
- # with the License. You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing,
12
- # software distributed under the License is distributed on an
13
- # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
- # KIND, either express or implied. See the License for the
15
- # specific language governing permissions and limitations
16
- # under the License.