datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,162 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <sstream>
21
+ #include <pybind11/pybind11.h>
22
+
23
+ #include "theta_sketch.hpp"
24
+ #include "theta_union.hpp"
25
+ #include "theta_intersection.hpp"
26
+ #include "theta_a_not_b.hpp"
27
+ #include "common_defs.hpp"
28
+
29
+
30
+ namespace py = pybind11;
31
+
32
+ namespace datasketches {
33
+ namespace python {
34
+
35
+ update_theta_sketch update_theta_sketch_factory(uint8_t lg_k, double p, uint64_t seed) {
36
+ update_theta_sketch::builder builder;
37
+ builder.set_lg_k(lg_k);
38
+ builder.set_p(p);
39
+ builder.set_seed(seed);
40
+ return builder.build();
41
+ }
42
+
43
+ theta_union theta_union_factory(uint8_t lg_k, double p, uint64_t seed) {
44
+ theta_union::builder builder;
45
+ builder.set_lg_k(lg_k);
46
+ builder.set_p(p);
47
+ builder.set_seed(seed);
48
+ return builder.build();
49
+ }
50
+
51
+ theta_sketch* theta_sketch_deserialize(py::bytes skBytes, uint64_t seed) {
52
+ std::string skStr = skBytes; // implicit cast
53
+ return theta_sketch::deserialize(skStr.c_str(), skStr.length(), seed).release();
54
+ }
55
+
56
+ py::object theta_sketch_serialize(const theta_sketch& sk) {
57
+ auto serResult = sk.serialize();
58
+ return py::bytes((char*)serResult.data(), serResult.size());
59
+ }
60
+
61
+ uint16_t theta_sketch_get_seed_hash(const theta_sketch& sk) {
62
+ return sk.get_seed_hash();
63
+ }
64
+
65
+ update_theta_sketch update_theta_sketch_deserialize(py::bytes skBytes, uint64_t seed) {
66
+ std::string skStr = skBytes; // implicit cast
67
+ return update_theta_sketch::deserialize(skStr.c_str(), skStr.length(), seed);
68
+ }
69
+
70
+ compact_theta_sketch compact_theta_sketch_deserialize(py::bytes skBytes, uint64_t seed) {
71
+ std::string skStr = skBytes; // implicit cast
72
+ return compact_theta_sketch::deserialize(skStr.c_str(), skStr.length(), seed);
73
+ }
74
+
75
+ }
76
+ }
77
+
78
+ namespace dspy = datasketches::python;
79
+
80
+ void init_theta(py::module &m) {
81
+ using namespace datasketches;
82
+
83
+ py::class_<theta_sketch>(m, "theta_sketch")
84
+ .def("serialize", &dspy::theta_sketch_serialize,
85
+ "Serializes the sketch into a bytes object")
86
+ .def_static("deserialize", &dspy::theta_sketch_deserialize, py::arg("bytes"), py::arg("seed")=DEFAULT_SEED,
87
+ "Reads a bytes object and returns the corresponding cpc_sketch")
88
+ .def("__str__", &theta_sketch::to_string, py::arg("print_items")=false,
89
+ "Produces a string summary of the sketch")
90
+ .def("to_string", &theta_sketch::to_string, py::arg("print_items")=false,
91
+ "Produces a string summary of the sketch")
92
+ .def("is_empty", &theta_sketch::is_empty,
93
+ "Returns True if the sketch is empty, otherwise Dalse")
94
+ .def("get_estimate", &theta_sketch::get_estimate,
95
+ "Estimate of the distinct count of the input stream")
96
+ .def("get_upper_bound", &theta_sketch::get_upper_bound, py::arg("num_std_devs"),
97
+ "Returns an approximate upper bound on the estimate at standard deviations in {1, 2, 3}")
98
+ .def("get_lower_bound", &theta_sketch::get_lower_bound, py::arg("num_std_devs"),
99
+ "Returns an approximate lower bound on the estimate at standard deviations in {1, 2, 3}")
100
+ .def("is_estimation_mode", &theta_sketch::is_estimation_mode,
101
+ "Returns True if sketch is in estimation mode, otherwise False")
102
+ .def("get_theta", &theta_sketch::get_theta,
103
+ "Returns theta (effective sampling rate) as a fraction from 0 to 1")
104
+ .def("get_num_retained", &theta_sketch::get_num_retained,
105
+ "Retunrs the number of items currently in the sketch")
106
+ .def("get_seed_hash", &dspy::theta_sketch_get_seed_hash,
107
+ "Returns a hash of the seed used in the sketch")
108
+ .def("is_ordered", &theta_sketch::is_ordered,
109
+ "Returns True if the sketch entries are sorted, otherwise False")
110
+ ;
111
+
112
+ py::class_<update_theta_sketch, theta_sketch>(m, "update_theta_sketch")
113
+ .def(py::init(&dspy::update_theta_sketch_factory),
114
+ py::arg("lg_k")=update_theta_sketch::builder::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED)
115
+ .def(py::init<const update_theta_sketch&>())
116
+ .def("update", (void (update_theta_sketch::*)(int64_t)) &update_theta_sketch::update, py::arg("datum"),
117
+ "Updates the sketch with the given integral value")
118
+ .def("update", (void (update_theta_sketch::*)(double)) &update_theta_sketch::update, py::arg("datum"),
119
+ "Updates the sketch with the given floating point value")
120
+ .def("update", (void (update_theta_sketch::*)(const std::string&)) &update_theta_sketch::update, py::arg("datum"),
121
+ "Updates the sketch with the given string")
122
+ .def("compact", &update_theta_sketch::compact, py::arg("ordered")=true,
123
+ "Returns a compacted form of the sketch, optionally sorting it")
124
+ .def_static("deserialize", &dspy::update_theta_sketch_deserialize,
125
+ py::arg("bytes"), py::arg("seed")=DEFAULT_SEED,
126
+ "Reads a bytes object and returns the corresponding update_theta_sketch")
127
+ ;
128
+
129
+ py::class_<compact_theta_sketch, theta_sketch>(m, "compact_theta_sketch")
130
+ .def(py::init<const compact_theta_sketch&>())
131
+ .def(py::init<const theta_sketch&, bool>())
132
+ .def_static("deserialize", &dspy::compact_theta_sketch_deserialize,
133
+ py::arg("bytes"), py::arg("seed")=DEFAULT_SEED,
134
+ "Reads a bytes object and returns the corresponding update_theta_sketch")
135
+ ;
136
+
137
+ py::class_<theta_union>(m, "theta_union")
138
+ .def(py::init(&dspy::theta_union_factory),
139
+ py::arg("lg_k")=update_theta_sketch::builder::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED)
140
+ .def("update", &theta_union::update, py::arg("sketch"),
141
+ "Updates the union with the given sketch")
142
+ .def("get_result", &theta_union::get_result, py::arg("ordered")=true,
143
+ "Returns the sketch corresponding to the union result")
144
+ ;
145
+
146
+ py::class_<theta_intersection>(m, "theta_intersection")
147
+ .def(py::init<uint64_t>(), py::arg("seed")=DEFAULT_SEED)
148
+ .def(py::init<const theta_intersection&>())
149
+ .def("update", &theta_intersection::update, py::arg("sketch"),
150
+ "Intersections the provided sketch with the current intersection state")
151
+ .def("get_result", &theta_intersection::get_result, py::arg("ordered")=true,
152
+ "Returns the sketch corresponding to the intersection result")
153
+ .def("has_result", &theta_intersection::has_result,
154
+ "Returns True if the intersection has a valid result, otherwisel False")
155
+ ;
156
+
157
+ py::class_<theta_a_not_b>(m, "theta_a_not_b")
158
+ .def(py::init<uint64_t>(), py::arg("seed")=DEFAULT_SEED)
159
+ .def("compute", &theta_a_not_b::compute, py::arg("a"), py::arg("b"), py::arg("ordered")=true,
160
+ "Returns a sketch with the reuslt of appying the A-not-B operation on the given inputs")
161
+ ;
162
+ }
@@ -0,0 +1,488 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include "kll_sketch.hpp"
21
+
22
+ #include <pybind11/pybind11.h>
23
+ #include <pybind11/stl.h>
24
+ #include <pybind11/numpy.h>
25
+ #include <sstream>
26
+ #include <vector>
27
+
28
+ namespace py = pybind11;
29
+
30
+ namespace datasketches {
31
+
32
+ // Wrapper class for Numpy compatibility
33
+ template <typename T, typename C = std::less<T>, typename S = serde<T>>
34
+ class vector_of_kll_sketches {
35
+ public:
36
+ static const uint32_t DEFAULT_K = kll_sketch<T, C, S>::DEFAULT_K;
37
+ static const uint32_t DEFAULT_D = 1;
38
+
39
+ explicit vector_of_kll_sketches(uint32_t k = DEFAULT_K, uint32_t d = DEFAULT_D);
40
+ vector_of_kll_sketches(const vector_of_kll_sketches& other);
41
+ vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept;
42
+ vector_of_kll_sketches<T,C,S>& operator=(const vector_of_kll_sketches& other);
43
+ vector_of_kll_sketches<T,C,S>& operator=(vector_of_kll_sketches&& other);
44
+
45
+ // container parameters
46
+ inline uint32_t get_k() const;
47
+ inline uint32_t get_d() const;
48
+
49
+ // sketch updates/merges
50
+ void update(const py::array_t<T>& items);
51
+ void merge(const vector_of_kll_sketches<T>& other);
52
+
53
+ // returns a single sketch combining all data in the array
54
+ kll_sketch<T,C,S> collapse(const py::array_t<int>& isk) const;
55
+
56
+ // sketch queries returning an array of results
57
+ py::array is_empty() const;
58
+ py::array get_n() const;
59
+ py::array is_estimation_mode() const;
60
+ py::array get_min_values() const;
61
+ py::array get_max_values() const;
62
+ py::array get_num_retained() const;
63
+ py::array get_quantiles(const py::array_t<double>& fractions, const py::array_t<int>& isk) const;
64
+ py::array get_ranks(const py::array_t<T>& values, const py::array_t<int>& isk) const;
65
+ py::array get_pmf(const py::array_t<T>& split_points, const py::array_t<int>& isk) const;
66
+ py::array get_cdf(const py::array_t<T>& split_points, const py::array_t<int>& isk) const;
67
+
68
+ // human-readable output
69
+ std::string to_string(bool print_levels = false, bool print_items = false) const;
70
+
71
+ // binary output/input
72
+ py::list serialize(py::array_t<uint32_t>& isk);
73
+ // note: deserialize() replaces the sketch at the specified
74
+ // index. Not a static method.
75
+ void deserialize(const py::bytes& sk_bytes, uint32_t idx);
76
+
77
+ private:
78
+ std::vector<uint32_t> get_indices(const py::array_t<int>& isk) const;
79
+
80
+ const uint32_t k_; // kll sketch k parameter
81
+ const uint32_t d_; // number of dimensions (here: sketches) to hold
82
+ std::vector<kll_sketch<T,C,S>> sketches_;
83
+ };
84
+
85
+ template<typename T, typename C, typename S>
86
+ vector_of_kll_sketches<T,C,S>::vector_of_kll_sketches(uint32_t k, uint32_t d):
87
+ k_(k),
88
+ d_(d)
89
+ {
90
+ // check d is valid (k is checked by kll_sketch)
91
+ if (d < 1) {
92
+ throw std::invalid_argument("D must be >= 1: " + std::to_string(d));
93
+ }
94
+
95
+ sketches_.reserve(d);
96
+ // spawn the sketches
97
+ for (uint32_t i = 0; i < d; i++) {
98
+ sketches_.emplace_back(k);
99
+ }
100
+ }
101
+
102
+ template<typename T, typename C, typename S>
103
+ vector_of_kll_sketches<T,C,S>::vector_of_kll_sketches(const vector_of_kll_sketches& other) :
104
+ k_(other.k_),
105
+ d_(other.d_),
106
+ sketches_(other.sketches_)
107
+ {}
108
+
109
+ template<typename T, typename C, typename S>
110
+ vector_of_kll_sketches<T,C,S>::vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept :
111
+ k_(other.k_),
112
+ d_(other.d_),
113
+ sketches_(std::move(other.sketches_))
114
+ {}
115
+
116
+ template<typename T, typename C, typename S>
117
+ vector_of_kll_sketches<T,C,S>& vector_of_kll_sketches<T,C,S>::operator=(const vector_of_kll_sketches& other) {
118
+ vector_of_kll_sketches<T,C,S> copy(other);
119
+ k_ = copy.k_;
120
+ d_ = copy.d_;
121
+ std::swap(sketches_, copy.sketches_);
122
+ return *this;
123
+ }
124
+
125
+ template<typename T, typename C, typename S>
126
+ vector_of_kll_sketches<T,C,S>& vector_of_kll_sketches<T,C,S>::operator=(vector_of_kll_sketches&& other) {
127
+ k_ = other.k_;
128
+ d_ = other.d_;
129
+ std::swap(sketches_, other.sketches_);
130
+ return *this;
131
+ }
132
+
133
+ template<typename T, typename C, typename S>
134
+ uint32_t vector_of_kll_sketches<T,C,S>::get_k() const {
135
+ return k_;
136
+ }
137
+
138
+ template<typename T, typename C, typename S>
139
+ uint32_t vector_of_kll_sketches<T,C,S>::get_d() const {
140
+ return d_;
141
+ }
142
+
143
+ template<typename T, typename C, typename S>
144
+ std::vector<uint32_t> vector_of_kll_sketches<T,C,S>::get_indices(const py::array_t<int>& isk) const {
145
+ std::vector<uint32_t> indices;
146
+ if (isk.size() == 1) {
147
+ auto data = isk.unchecked();
148
+ if (data(0) == -1) {
149
+ indices.reserve(d_);
150
+ for (uint32_t i = 0; i < d_; ++i) {
151
+ indices.push_back(i);
152
+ }
153
+ } else {
154
+ indices.push_back(static_cast<uint32_t>(data(0)));
155
+ }
156
+ } else {
157
+ auto data = isk.unchecked<1>();
158
+ indices.reserve(isk.size());
159
+ for (uint32_t i = 0; i < isk.size(); ++i) {
160
+ const uint32_t idx = static_cast<uint32_t>(data(i));
161
+ if (idx < d_) {
162
+ indices.push_back(idx);
163
+ } else {
164
+ throw std::invalid_argument("request for invalid dimenions >= d ("
165
+ + std::to_string(d_) +"): "+ std::to_string(idx));
166
+ }
167
+ }
168
+ }
169
+ return indices;
170
+ }
171
+
172
+ // Checks if each sketch is empty or not
173
+ template<typename T, typename C, typename S>
174
+ py::array vector_of_kll_sketches<T,C,S>::is_empty() const {
175
+ std::vector<bool> vals(d_);
176
+ for (uint32_t i = 0; i < d_; ++i) {
177
+ vals[i] = sketches_[i].is_empty();
178
+ }
179
+
180
+ return py::cast(vals);
181
+ }
182
+
183
+ // Updates each sketch with values
184
+ // Currently: all values must be present
185
+ // TODO: allow subsets of sketches to be updated
186
+ template<typename T, typename C, typename S>
187
+ void vector_of_kll_sketches<T,C,S>::update(const py::array_t<T>& items) {
188
+
189
+ size_t ndim = items.ndim();
190
+
191
+ if (items.shape(ndim-1) != d_) {
192
+ throw std::invalid_argument("input data must have rows with " + std::to_string(d_)
193
+ + " elements. Found: " + std::to_string(items.shape(ndim-1)));
194
+ }
195
+
196
+ if (ndim == 1) {
197
+ // 1D case: single value to update per sketch
198
+ auto data = items.template unchecked<1>();
199
+ for (uint32_t i = 0; i < d_; ++i) {
200
+ sketches_[i].update(data(i));
201
+ }
202
+ }
203
+ else if (ndim == 2) {
204
+ // 2D case: multiple values to update per sketch
205
+ auto data = items.template unchecked<2>();
206
+ if (items.flags() & py::array::f_style) {
207
+ for (uint32_t j = 0; j < d_; ++j) {
208
+ for (uint32_t i = 0; i < items.shape(0); ++i) {
209
+ sketches_[j].update(data(i,j));
210
+ }
211
+ }
212
+ } else { // py::array::c_style or py::array::forcecast
213
+ for (uint32_t i = 0; i < items.shape(0); ++i) {
214
+ for (uint32_t j = 0; j < d_; ++j) {
215
+ sketches_[j].update(data(i,j));
216
+ }
217
+ }
218
+ }
219
+ }
220
+ else {
221
+ throw std::invalid_argument("Update input must be 2 or fewer dimensions : " + std::to_string(ndim));
222
+ }
223
+ }
224
+
225
+ // Merges two arrays of sketches
226
+ // Currently: all values must be present
227
+ template<typename T, typename C, typename S>
228
+ void vector_of_kll_sketches<T,C,S>::merge(const vector_of_kll_sketches<T>& other) {
229
+ if (d_ != other.get_d()) {
230
+ throw std::invalid_argument("Must have same number of dimensions to merge: " + std::to_string(d_)
231
+ + " vs " + std::to_string(other.d_));
232
+ } else {
233
+ for (uint32_t i = 0; i < d_; ++i) {
234
+ sketches_[i].merge(other.sketches_[i]);
235
+ }
236
+ }
237
+ }
238
+
239
+ template<typename T, typename C, typename S>
240
+ kll_sketch<T,C,S> vector_of_kll_sketches<T,C,S>::collapse(const py::array_t<int>& isk) const {
241
+ std::vector<uint32_t> inds = get_indices(isk);
242
+
243
+ kll_sketch<T,C,S> result(k_);
244
+ for (auto& idx : inds) {
245
+ result.merge(sketches_[idx]);
246
+ }
247
+ return result;
248
+ }
249
+
250
+ // Number of updates for each sketch
251
+ template<typename T, typename C, typename S>
252
+ py::array vector_of_kll_sketches<T,C,S>::get_n() const {
253
+ std::vector<uint64_t> vals(d_);
254
+ for (uint32_t i = 0; i < d_; ++i) {
255
+ vals[i] = sketches_[i].get_n();
256
+ }
257
+ return py::cast(vals);
258
+ }
259
+
260
+ // Number of retained values for each sketch
261
+ template<typename T, typename C, typename S>
262
+ py::array vector_of_kll_sketches<T,C,S>::get_num_retained() const {
263
+ std::vector<uint32_t> vals(d_);
264
+ for (uint32_t i = 0; i < d_; ++i) {
265
+ vals[i] = sketches_[i].get_num_retained();
266
+ }
267
+ return py::cast(vals);
268
+ }
269
+
270
+ // Gets the minimum value of each sketch
271
+ // TODO: allow subsets of sketches
272
+ template<typename T, typename C, typename S>
273
+ py::array vector_of_kll_sketches<T,C,S>::get_min_values() const {
274
+ std::vector<T> vals(d_);
275
+ for (uint32_t i = 0; i < d_; ++i) {
276
+ vals[i] = sketches_[i].get_min_value();
277
+ }
278
+ return py::cast(vals);
279
+ }
280
+
281
+ // Gets the maximum value of each sketch
282
+ // TODO: allow subsets of sketches
283
+ template<typename T, typename C, typename S>
284
+ py::array vector_of_kll_sketches<T,C,S>::get_max_values() const {
285
+ std::vector<T> vals(d_);
286
+ for (uint32_t i = 0; i < d_; ++i) {
287
+ vals[i] = sketches_[i].get_max_value();
288
+ }
289
+ return py::cast(vals);
290
+ }
291
+
292
+ // Summary of each sketch as one long string
293
+ // Users should use .split('\n\n') when calling it to build a list of each
294
+ // sketch's summary
295
+ template<typename T, typename C, typename S>
296
+ std::string vector_of_kll_sketches<T,C,S>::to_string(bool print_levels, bool print_items) const {
297
+ std::ostringstream ss;
298
+ for (uint32_t i = 0; i < d_; ++i) {
299
+ // all streams into 1 string, for compatibility with Python's str() behavior
300
+ // users will need to split by \n\n, e.g., str(kll).split('\n\n')
301
+ if (i > 0) ss << "\n";
302
+ ss << sketches_[i].to_string(print_levels, print_items);
303
+ }
304
+ return ss.str();
305
+ }
306
+
307
+ template<typename T, typename C, typename S>
308
+ py::array vector_of_kll_sketches<T,C,S>::is_estimation_mode() const {
309
+ std::vector<bool> vals(d_);
310
+ for (uint32_t i = 0; i < d_; ++i) {
311
+ vals[i] = sketches_[i].is_estimation_mode();
312
+ }
313
+ return py::cast(vals);
314
+ }
315
+
316
+ // Value of sketch(es) corresponding to some quantile(s)
317
+ template<typename T, typename C, typename S>
318
+ py::array vector_of_kll_sketches<T,C,S>::get_quantiles(const py::array_t<double>& fractions,
319
+ const py::array_t<int>& isk) const {
320
+ std::vector<uint32_t> inds = get_indices(isk);
321
+ size_t num_sketches = inds.size();
322
+ size_t num_quantiles = fractions.size();
323
+
324
+ std::vector<std::vector<T>> quants(num_sketches, std::vector<T>(num_quantiles));
325
+ for (uint32_t i = 0; i < num_sketches; ++i) {
326
+ auto quant = sketches_[inds[i]].get_quantiles(fractions.data(), num_quantiles);
327
+ for (size_t j = 0; j < num_quantiles; ++j) {
328
+ quants[i][j] = quant[j];
329
+ }
330
+ }
331
+
332
+ return py::cast(quants);
333
+ }
334
+
335
+ // Value of sketch(es) corresponding to some rank(s)
336
+ template<typename T, typename C, typename S>
337
+ py::array vector_of_kll_sketches<T,C,S>::get_ranks(const py::array_t<T>& values,
338
+ const py::array_t<int>& isk) const {
339
+ std::vector<uint32_t> inds = get_indices(isk);
340
+ size_t num_sketches = inds.size();
341
+ size_t num_ranks = values.size();
342
+ auto vals = values.data();
343
+
344
+ std::vector<std::vector<float>> ranks(num_sketches, std::vector<float>(num_ranks));
345
+ for (uint32_t i = 0; i < num_sketches; ++i) {
346
+ for (size_t j = 0; j < num_ranks; ++j) {
347
+ ranks[i][j] = sketches_[inds[i]].get_rank(vals[j]);
348
+ }
349
+ }
350
+
351
+ return py::cast(ranks);
352
+ }
353
+
354
+ // PMF(s) of sketch(es)
355
+ template<typename T, typename C, typename S>
356
+ py::array vector_of_kll_sketches<T,C,S>::get_pmf(const py::array_t<T>& split_points,
357
+ const py::array_t<int>& isk) const {
358
+ std::vector<uint32_t> inds = get_indices(isk);
359
+ size_t num_sketches = inds.size();
360
+ size_t num_splits = split_points.size();
361
+
362
+ std::vector<std::vector<T>> pmfs(num_sketches, std::vector<T>(num_splits + 1));
363
+ for (uint32_t i = 0; i < num_sketches; ++i) {
364
+ auto pmf = sketches_[inds[i]].get_PMF(split_points.data(), num_splits);
365
+ for (size_t j = 0; j <= num_splits; ++j) {
366
+ pmfs[i][j] = pmf[j];
367
+ }
368
+ }
369
+
370
+ return py::cast(pmfs);
371
+ }
372
+
373
+ // CDF(s) of sketch(es)
374
+ template<typename T, typename C, typename S>
375
+ py::array vector_of_kll_sketches<T,C,S>::get_cdf(const py::array_t<T>& split_points,
376
+ const py::array_t<int>& isk) const {
377
+ std::vector<uint32_t> inds = get_indices(isk);
378
+ size_t num_sketches = inds.size();
379
+ size_t num_splits = split_points.size();
380
+
381
+ std::vector<std::vector<T>> cdfs(num_sketches, std::vector<T>(num_splits + 1));
382
+ for (uint32_t i = 0; i < num_sketches; ++i) {
383
+ auto cdf = sketches_[inds[i]].get_CDF(split_points.data(), num_splits);
384
+ for (size_t j = 0; j <= num_splits; ++j) {
385
+ cdfs[i][j] = cdf[j];
386
+ }
387
+ }
388
+
389
+ return py::cast(cdfs);
390
+ }
391
+
392
+ template<typename T, typename C, typename S>
393
+ void vector_of_kll_sketches<T,C,S>::deserialize(const py::bytes& sk_bytes,
394
+ uint32_t idx) {
395
+ if (idx >= d_) {
396
+ throw std::invalid_argument("request for invalid dimenions >= d ("
397
+ + std::to_string(d_) +"): "+ std::to_string(idx));
398
+ }
399
+ std::string skStr = sk_bytes; // implicit cast
400
+ // load the sketch into the proper index
401
+ sketches_[idx] = std::move(kll_sketch<T>::deserialize(skStr.c_str(), skStr.length()));
402
+ }
403
+
404
+ template<typename T, typename C, typename S>
405
+ py::list vector_of_kll_sketches<T,C,S>::serialize(py::array_t<uint32_t>& isk) {
406
+ std::vector<uint32_t> inds = get_indices(isk);
407
+ const size_t num_sketches = inds.size();
408
+
409
+ py::list list(num_sketches);
410
+ for (uint32_t i = 0; i < num_sketches; ++i) {
411
+ auto serResult = sketches_[inds[i]].serialize();
412
+ list[i] = py::bytes((char*)serResult.data(), serResult.size());
413
+ }
414
+
415
+ return list;
416
+ }
417
+
418
+ namespace python {
419
+ template<typename T>
420
+ double kll_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
421
+ return kll_sketch<T>::get_normalized_rank_error(k, pmf);
422
+ }
423
+
424
+ } // namespace datasketches::python
425
+
426
+ } // namespace datasketches
427
+
428
+ namespace dspy = datasketches::python;
429
+
430
+ template<typename T>
431
+ void bind_vector_of_kll_sketches(py::module &m, const char* name) {
432
+ using namespace datasketches;
433
+
434
+ py::class_<vector_of_kll_sketches<T>>(m, name)
435
+ .def(py::init<uint32_t, uint32_t>(), py::arg("k")=vector_of_kll_sketches<T>::DEFAULT_K,
436
+ py::arg("d")=vector_of_kll_sketches<T>::DEFAULT_D)
437
+ .def(py::init<const vector_of_kll_sketches<T>&>())
438
+ // allow user to retrieve k or d, in case it's instantiated w/ defaults
439
+ .def("get_k", &vector_of_kll_sketches<T>::get_k,
440
+ "Returns the value of `k` of the sketch(es)")
441
+ .def("get_d", &vector_of_kll_sketches<T>::get_d,
442
+ "Returns the number of sketches")
443
+ .def("update", &vector_of_kll_sketches<T>::update, py::arg("items"),
444
+ "Updates the sketch(es) with value(s). Must be a 1D array of size equal to the number of sketches. Can also be 2D array of shape (n_updates, n_sketches). If a sketch does not have a value to update, use np.nan")
445
+ .def("__str__", &vector_of_kll_sketches<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
446
+ "Produces a string summary of all sketches. Users should split the returned string by '\n\n'")
447
+ .def("to_string", &vector_of_kll_sketches<T>::to_string, py::arg("print_levels")=false,
448
+ py::arg("print_items")=false,
449
+ "Produces a string summary of all sketches. Users should split the returned string by '\n\n'")
450
+ .def("is_empty", &vector_of_kll_sketches<T>::is_empty,
451
+ "Returns whether the sketch(es) is(are) empty of not")
452
+ .def("get_n", &vector_of_kll_sketches<T>::get_n,
453
+ "Returns the number of values seen by the sketch(es)")
454
+ .def("get_num_retained", &vector_of_kll_sketches<T>::get_num_retained,
455
+ "Returns the number of values retained by the sketch(es)")
456
+ .def("is_estimation_mode", &vector_of_kll_sketches<T>::is_estimation_mode,
457
+ "Returns whether the sketch(es) is(are) in estimation mode")
458
+ .def("get_min_values", &vector_of_kll_sketches<T>::get_min_values,
459
+ "Returns the minimum value(s) of the sketch(es)")
460
+ .def("get_max_values", &vector_of_kll_sketches<T>::get_max_values,
461
+ "Returns the maximum value(s) of the sketch(es)")
462
+ .def("get_quantiles", &vector_of_kll_sketches<T>::get_quantiles, py::arg("fractions"),
463
+ py::arg("isk")=-1,
464
+ "Returns the value(s) associated with the specified quantile(s) for the specified sketch(es). `fractions` can be a float between 0 and 1 (inclusive), or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)")
465
+ .def("get_ranks", &vector_of_kll_sketches<T>::get_ranks, py::arg("values"),
466
+ py::arg("isk")=-1,
467
+ "Returns the value(s) associated with the specified ranks(s) for the specified sketch(es). `values` can be an int between 0 and the number of values retained, or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)")
468
+ .def("get_pmf", &vector_of_kll_sketches<T>::get_pmf, py::arg("split_points"), py::arg("isk")=-1,
469
+ "Returns the probability mass function (PMF) at `split_points` of the specified sketch(es). `split_points` should be a list/array of floats between 0 and 1 (inclusive). `isk` specifies which sketch(es) to return the PMF for (default: all sketches)")
470
+ .def("get_cdf", &vector_of_kll_sketches<T>::get_cdf, py::arg("split_points"), py::arg("isk")=-1,
471
+ "Returns the cumulative distribution function (CDF) at `split_points` of the specified sketch(es). `split_points` should be a list/array of floats between 0 and 1 (inclusive). `isk` specifies which sketch(es) to return the CDF for (default: all sketches)")
472
+ .def_static("get_normalized_rank_error", &dspy::kll_sketch_generic_normalized_rank_error<T>,
473
+ py::arg("k"), py::arg("as_pmf"), "Returns the normalized rank error")
474
+ .def("serialize", &vector_of_kll_sketches<T>::serialize, py::arg("isk")=-1,
475
+ "Serializes the specified sketch(es). `isk` can be an int or a list/array of ints (default: all sketches)")
476
+ .def("deserialize", &vector_of_kll_sketches<T>::deserialize, py::arg("skBytes"), py::arg("isk"),
477
+ "Deserializes the specified sketch. `isk` must be an int.")
478
+ .def("merge", &vector_of_kll_sketches<T>::merge, py::arg("array_of_sketches"),
479
+ "Merges the input array of KLL sketches into the existing array.")
480
+ .def("collapse", &vector_of_kll_sketches<T>::collapse, py::arg("isk")=-1,
481
+ "Returns the result of collapsing all sketches in the array into a single sketch. 'isk' can be an int or a list/array of ints (default: all sketches)")
482
+ ;
483
+ }
484
+
485
+ void init_vector_of_kll(py::module &m) {
486
+ bind_vector_of_kll_sketches<int>(m, "vector_of_kll_ints_sketches");
487
+ bind_vector_of_kll_sketches<float>(m, "vector_of_kll_floats_sketches");
488
+ }