datasketches 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/NOTICE +1 -1
  4. data/README.md +0 -2
  5. data/ext/datasketches/cpc_wrapper.cpp +2 -2
  6. data/ext/datasketches/kll_wrapper.cpp +0 -10
  7. data/lib/datasketches/version.rb +1 -1
  8. data/lib/datasketches.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  10. data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
  11. data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
  12. data/vendor/datasketches-cpp/Doxyfile +2827 -0
  13. data/vendor/datasketches-cpp/LICENSE +0 -76
  14. data/vendor/datasketches-cpp/NOTICE +1 -1
  15. data/vendor/datasketches-cpp/README.md +1 -3
  16. data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
  18. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
  19. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
  20. data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
  21. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
  22. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
  23. data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
  24. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  25. data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
  26. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
  27. data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
  28. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
  29. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
  30. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
  31. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
  32. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
  33. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
  34. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
  35. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
  36. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
  37. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
  38. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
  39. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
  40. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
  41. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
  42. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
  43. data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
  44. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
  45. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
  46. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
  47. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
  48. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
  49. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
  50. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
  51. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
  52. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
  53. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
  54. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
  55. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
  56. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
  57. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
  58. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
  59. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
  60. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
  62. data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
  63. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
  64. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
  65. data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
  66. data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
  67. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
  70. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
  71. data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
  74. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
  75. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
  76. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
  77. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
  78. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
  79. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
  80. data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
  84. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
  85. data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
  86. data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
  87. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
  88. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
  89. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
  90. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
  91. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
  92. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
  94. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
  95. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
  96. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
  97. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
  98. data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
  99. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
  100. data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
  101. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
  102. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
  103. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
  104. data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
  105. data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
  106. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
  107. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
  108. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  109. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
  110. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
  111. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
  112. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
  114. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
  115. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
  116. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
  117. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
  119. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
  120. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
  121. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
  122. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
  124. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
  125. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
  126. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
  127. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
  128. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
  129. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
  130. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
  131. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
  132. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
  133. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
  134. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
  135. data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
  136. data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
  137. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
  138. data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
  139. data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
  140. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
  141. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
  142. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
  143. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
  144. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
  145. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
  146. data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
  147. data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
  148. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
  149. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
  150. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
  151. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
  152. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
  153. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
  154. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  155. metadata +47 -93
  156. data/vendor/datasketches-cpp/MANIFEST.in +0 -39
  157. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  160. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  161. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  162. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  163. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  164. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  165. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  166. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  167. data/vendor/datasketches-cpp/pyproject.toml +0 -23
  168. data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
  169. data/vendor/datasketches-cpp/python/README.md +0 -85
  170. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
  171. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
  172. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
  173. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
  174. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
  175. data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
  176. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
  177. data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
  178. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
  179. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
  180. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
  181. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
  182. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
  183. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
  184. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
  185. data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
  186. data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
  187. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
  188. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
  189. data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
  190. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
  191. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
  192. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
  193. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
  194. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
  195. data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
  196. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
  197. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
  198. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
  199. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
  200. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
  201. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
  202. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
  203. data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
  204. data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
  205. data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
  206. data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
  207. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
  208. data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
  209. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
  210. data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
  211. data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
  212. data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
  213. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
  214. data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
  215. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  216. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  217. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  218. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  219. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  220. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
  221. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  222. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  223. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  224. data/vendor/datasketches-cpp/setup.py +0 -110
  225. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  226. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  227. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  228. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tox.ini +0 -26
  230. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
  231. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
  232. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
  233. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
  234. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -1,112 +0,0 @@
1
- /*
2
- * Licensed to the Apache Software Foundation (ASF) under one
3
- * or more contributor license agreements. See the NOTICE file
4
- * distributed with this work for additional information
5
- * regarding copyright ownership. The ASF licenses this file
6
- * to you under the Apache License, Version 2.0 (the
7
- * "License"); you may not use this file except in compliance
8
- * with the License. You may obtain a copy of the License at
9
- *
10
- * http://www.apache.org/licenses/LICENSE-2.0
11
- *
12
- * Unless required by applicable law or agreed to in writing,
13
- * software distributed under the License is distributed on an
14
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
- * KIND, either express or implied. See the License for the
16
- * specific language governing permissions and limitations
17
- * under the License.
18
- */
19
-
20
- #include <cstring>
21
- #include "memory_operations.hpp"
22
-
23
- #include "py_serde.hpp"
24
-
25
- #include <pybind11/pybind11.h>
26
-
27
- namespace py = pybind11;
28
-
29
- void init_serde(py::module& m) {
30
- using namespace datasketches;
31
- py::class_<py_object_serde, PyObjectSerDe /* <--- trampoline*/>(m, "PyObjectSerDe")
32
- .def(py::init<>())
33
- .def("get_size", &py_object_serde::get_size, py::arg("item"),
34
- "Returns the size in bytes of an item")
35
- .def("to_bytes", &py_object_serde::to_bytes, py::arg("item"),
36
- "Retuns a bytes object with a serialized version of an item")
37
- .def("from_bytes", &py_object_serde::from_bytes, py::arg("data"), py::arg("offset"),
38
- "Reads a bytes object starting from the given offest and returns a tuple of the reconstructed "
39
- "object and the number of additional bytes read")
40
- ;
41
- }
42
-
43
- namespace datasketches {
44
- size_t py_object_serde::size_of_item(const py::object& item) const {
45
- return get_size(item);
46
- }
47
-
48
- size_t py_object_serde::serialize(void* ptr, size_t capacity, const py::object* items, unsigned num) const {
49
- size_t bytes_written = 0;
50
- py::gil_scoped_acquire acquire;
51
- for (unsigned i = 0; i < num; ++i) {
52
- std::string bytes = to_bytes(items[i]); // implicit cast from py::bytes
53
- check_memory_size(bytes_written + bytes.size(), capacity);
54
- memcpy(ptr, bytes.c_str(), bytes.size());
55
- ptr = static_cast<char*>(ptr) + bytes.size();
56
- bytes_written += bytes.size();
57
- }
58
- py::gil_scoped_release release;
59
- return bytes_written;
60
- }
61
-
62
- size_t py_object_serde::deserialize(const void* ptr, size_t capacity, py::object* items, unsigned num) const {
63
- size_t bytes_read = 0;
64
- unsigned i = 0;
65
- bool failure = false;
66
- bool error_from_python = false;
67
- py::gil_scoped_acquire acquire;
68
-
69
- // copy data into bytes only once
70
- py::bytes bytes(static_cast<const char*>(ptr), capacity);
71
- for (; i < num && !failure; ++i) {
72
- py::tuple bytes_and_len;
73
- try {
74
- bytes_and_len = from_bytes(bytes, bytes_read);
75
- } catch (py::error_already_set &e) {
76
- failure = true;
77
- error_from_python = true;
78
- break;
79
- }
80
-
81
- size_t length = py::cast<size_t>(bytes_and_len[1]);
82
- if (bytes_read + length > capacity) {
83
- bytes_read += length; // use this value to report the error
84
- failure = true;
85
- break;
86
- }
87
-
88
- new (&items[i]) py::object(py::cast<py::object>(bytes_and_len[0]));
89
- ptr = static_cast<const char*>(ptr) + length;
90
- bytes_read += length;
91
- }
92
-
93
- if (failure) {
94
- // clean up what we've allocated
95
- for (unsigned j = 0; j < i; ++j) {
96
- items[j].dec_ref();
97
- }
98
-
99
- if (error_from_python) {
100
- throw py::value_error("Error reading value in from_bytes");
101
- } else {
102
- // this next call will throw
103
- check_memory_size(bytes_read, capacity);
104
- }
105
- }
106
-
107
- py::gil_scoped_release release;
108
- return bytes_read;
109
- }
110
-
111
-
112
- } // namespace datasketches
@@ -1,155 +0,0 @@
1
- /*
2
- * Licensed to the Apache Software Foundation (ASF) under one
3
- * or more contributor license agreements. See the NOTICE file
4
- * distributed with this work for additional information
5
- * regarding copyright ownership. The ASF licenses this file
6
- * to you under the Apache License, Version 2.0 (the
7
- * "License"); you may not use this file except in compliance
8
- * with the License. You may obtain a copy of the License at
9
- *
10
- * http://www.apache.org/licenses/LICENSE-2.0
11
- *
12
- * Unless required by applicable law or agreed to in writing,
13
- * software distributed under the License is distributed on an
14
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
- * KIND, either express or implied. See the License for the
16
- * specific language governing permissions and limitations
17
- * under the License.
18
- */
19
-
20
- #include "py_object_lt.hpp"
21
- #include "py_object_ostream.hpp"
22
- #include "quantile_conditional.hpp"
23
- #include "quantiles_sketch.hpp"
24
-
25
- #include <pybind11/pybind11.h>
26
- #include <pybind11/stl.h>
27
- #include <pybind11/numpy.h>
28
- #include <vector>
29
- #include <stdexcept>
30
-
31
- namespace py = pybind11;
32
-
33
- template<typename T, typename C>
34
- void bind_quantiles_sketch(py::module &m, const char* name) {
35
- using namespace datasketches;
36
-
37
- auto quantiles_class = py::class_<quantiles_sketch<T, C>>(m, name)
38
- .def(py::init<uint16_t>(), py::arg("k")=quantiles_constants::DEFAULT_K)
39
- .def(py::init<const quantiles_sketch<T, C>&>())
40
- .def(
41
- "update",
42
- static_cast<void (quantiles_sketch<T, C>::*)(const T&)>(&quantiles_sketch<T, C>::update),
43
- py::arg("item"),
44
- "Updates the sketch with the given value"
45
- )
46
- .def("merge", (void (quantiles_sketch<T, C>::*)(const quantiles_sketch<T, C>&)) &quantiles_sketch<T, C>::merge, py::arg("sketch"),
47
- "Merges the provided sketch into this one")
48
- .def("__str__", &quantiles_sketch<T, C>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
49
- "Produces a string summary of the sketch")
50
- .def("to_string", &quantiles_sketch<T, C>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
51
- "Produces a string summary of the sketch")
52
- .def("is_empty", &quantiles_sketch<T, C>::is_empty,
53
- "Returns True if the sketch is empty, otherwise False")
54
- .def("get_k", &quantiles_sketch<T, C>::get_k,
55
- "Returns the configured parameter k")
56
- .def("get_n", &quantiles_sketch<T, C>::get_n,
57
- "Returns the length of the input stream")
58
- .def("get_num_retained", &quantiles_sketch<T, C>::get_num_retained,
59
- "Returns the number of retained items (samples) in the sketch")
60
- .def("is_estimation_mode", &quantiles_sketch<T, C>::is_estimation_mode,
61
- "Returns True if the sketch is in estimation mode, otherwise False")
62
- .def("get_min_value", &quantiles_sketch<T, C>::get_min_item,
63
- "Returns the minimum value from the stream. If empty, quantiles_floats_sketch returns nan; quantiles_ints_sketch throws a RuntimeError")
64
- .def("get_max_value", &quantiles_sketch<T, C>::get_max_item,
65
- "Returns the maximum value from the stream. If empty, quantiles_floats_sketch returns nan; quantiles_ints_sketch throws a RuntimeError")
66
- .def("get_quantile", &quantiles_sketch<T, C>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
67
- "Returns an approximation to the data value "
68
- "associated with the given rank in a hypothetical sorted "
69
- "version of the input stream so far.\n"
70
- "For quantiles_floats_sketch: if the sketch is empty this returns nan. "
71
- "For quantiles_ints_sketch: if the sketch is empty this throws a RuntimeError.")
72
- .def(
73
- "get_quantiles",
74
- [](const quantiles_sketch<T, C>& sk, const std::vector<double>& ranks, bool inclusive) {
75
- return sk.get_quantiles(ranks.data(), ranks.size(), inclusive);
76
- },
77
- py::arg("ranks"), py::arg("inclusive")=false,
78
- "This returns an array that could have been generated by using get_quantile() for each "
79
- "normalized rank separately.\n"
80
- "If the sketch is empty this returns an empty vector.\n"
81
- "Deprecated. Will be removed in the next major version. Use get_quantile() instead."
82
- )
83
- .def("get_rank", &quantiles_sketch<T, C>::get_rank, py::arg("value"), py::arg("inclusive")=false,
84
- "Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
85
- "The resulting approximation has a probabilistic guarantee that can be obtained from the "
86
- "get_normalized_rank_error(False) function.\n"
87
- "With the parameter inclusive=true the weight of the given value is included into the rank."
88
- "Otherwise the rank equals the sum of the weights of values less than the given value.\n"
89
- "If the sketch is empty this returns nan.")
90
- .def(
91
- "get_pmf",
92
- [](const quantiles_sketch<T, C>& sk, const std::vector<T>& split_points, bool inclusive) {
93
- return sk.get_PMF(split_points.data(), split_points.size(), inclusive);
94
- },
95
- py::arg("split_points"), py::arg("inclusive")=false,
96
- "Returns an approximation to the Probability Mass Function (PMF) of the input stream "
97
- "given a set of split points (values).\n"
98
- "The resulting approximations have a probabilistic guarantee that can be obtained from the "
99
- "get_normalized_rank_error(True) function.\n"
100
- "If the sketch is empty this returns an empty vector.\n"
101
- "split_points is an array of m unique, monotonically increasing float values "
102
- "that divide the real number line into m+1 consecutive disjoint intervals.\n"
103
- "The definition of an 'interval' is inclusive of the left split point (or minimum value) and "
104
- "exclusive of the right split point, with the exception that the last interval will include "
105
- "the maximum value.\n"
106
- "It is not necessary to include either the min or max values in these split points."
107
- )
108
- .def(
109
- "get_cdf",
110
- [](const quantiles_sketch<T, C>& sk, const std::vector<T>& split_points, bool inclusive) {
111
- return sk.get_CDF(split_points.data(), split_points.size(), inclusive);
112
- },
113
- py::arg("split_points"), py::arg("inclusive")=false,
114
- "Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
115
- "cumulative analog of the PMF, of the input stream given a set of split points (values).\n"
116
- "The resulting approximations have a probabilistic guarantee that can be obtained from the "
117
- "get_normalized_rank_error(True) function.\n"
118
- "If the sketch is empty this returns an empty vector.\n"
119
- "split_points is an array of m unique, monotonically increasing float values "
120
- "that divide the real number line into m+1 consecutive disjoint intervals.\n"
121
- "The definition of an 'interval' is inclusive of the left split point (or minimum value) and "
122
- "exclusive of the right split point, with the exception that the last interval will include "
123
- "the maximum value.\n"
124
- "It is not necessary to include either the min or max values in these split points."
125
- )
126
- .def(
127
- "normalized_rank_error",
128
- static_cast<double (quantiles_sketch<T, C>::*)(bool) const>(&quantiles_sketch<T, C>::get_normalized_rank_error),
129
- py::arg("as_pmf"),
130
- "Gets the normalized rank error for this sketch.\n"
131
- "If pmf is True, returns the 'double-sided' normalized rank error for the get_PMF() function.\n"
132
- "Otherwise, it is the 'single-sided' normalized rank error for all the other queries.\n"
133
- "Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials"
134
- )
135
- .def_static(
136
- "get_normalized_rank_error",
137
- [](uint16_t k, bool pmf) { return quantiles_sketch<T, C>::get_normalized_rank_error(k, pmf); },
138
- py::arg("k"), py::arg("as_pmf"),
139
- "Gets the normalized rank error given parameters k and the pmf flag.\n"
140
- "If pmf is True, returns the 'double-sided' normalized rank error for the get_PMF() function.\n"
141
- "Otherwise, it is the 'single-sided' normalized rank error for all the other queries.\n"
142
- "Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials"
143
- )
144
- .def("__iter__", [](const quantiles_sketch<T, C>& s) { return py::make_iterator(s.begin(), s.end()); });
145
-
146
- add_serialization<T>(quantiles_class);
147
- add_vector_update<T>(quantiles_class);
148
- }
149
-
150
- void init_quantiles(py::module &m) {
151
- bind_quantiles_sketch<int, std::less<int>>(m, "quantiles_ints_sketch");
152
- bind_quantiles_sketch<float, std::less<float>>(m, "quantiles_floats_sketch");
153
- bind_quantiles_sketch<double, std::less<double>>(m, "quantiles_doubles_sketch");
154
- bind_quantiles_sketch<py::object, py_object_lt>(m, "quantiles_items_sketch");
155
- }
@@ -1,154 +0,0 @@
1
- /*
2
- * Licensed to the Apache Software Foundation (ASF) under one
3
- * or more contributor license agreements. See the NOTICE file
4
- * distributed with this work for additional information
5
- * regarding copyright ownership. The ASF licenses this file
6
- * to you under the Apache License, Version 2.0 (the
7
- * "License"); you may not use this file except in compliance
8
- * with the License. You may obtain a copy of the License at
9
- *
10
- * http://www.apache.org/licenses/LICENSE-2.0
11
- *
12
- * Unless required by applicable law or agreed to in writing,
13
- * software distributed under the License is distributed on an
14
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
- * KIND, either express or implied. See the License for the
16
- * specific language governing permissions and limitations
17
- * under the License.
18
- */
19
-
20
- #include "py_object_lt.hpp"
21
- #include "py_object_ostream.hpp"
22
- #include "quantile_conditional.hpp"
23
- #include "req_sketch.hpp"
24
-
25
- #include <pybind11/pybind11.h>
26
- #include <pybind11/stl.h>
27
- #include <pybind11/numpy.h>
28
- #include <vector>
29
- #include <stdexcept>
30
-
31
- namespace py = pybind11;
32
-
33
- template<typename T, typename C>
34
- void bind_req_sketch(py::module &m, const char* name) {
35
- using namespace datasketches;
36
-
37
- auto req_class = py::class_<req_sketch<T, C>>(m, name)
38
- .def(py::init<uint16_t, bool>(), py::arg("k")=12, py::arg("is_hra")=true)
39
- .def(py::init<const req_sketch<T, C>&>())
40
- .def("update", (void (req_sketch<T, C>::*)(const T&)) &req_sketch<T, C>::update, py::arg("item"),
41
- "Updates the sketch with the given value")
42
- .def("merge", (void (req_sketch<T, C>::*)(const req_sketch<T, C>&)) &req_sketch<T, C>::merge, py::arg("sketch"),
43
- "Merges the provided sketch into this one")
44
- .def("__str__", &req_sketch<T, C>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
45
- "Produces a string summary of the sketch")
46
- .def("to_string", &req_sketch<T, C>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
47
- "Produces a string summary of the sketch")
48
- .def("is_hra", &req_sketch<T, C>::is_HRA,
49
- "Returns True if the sketch is in High Rank Accuracy mode, otherwise False")
50
- .def("is_empty", &req_sketch<T, C>::is_empty,
51
- "Returns True if the sketch is empty, otherwise False")
52
- .def("get_k", &req_sketch<T, C>::get_k,
53
- "Returns the configured parameter k")
54
- .def("get_n", &req_sketch<T, C>::get_n,
55
- "Returns the length of the input stream")
56
- .def("get_num_retained", &req_sketch<T, C>::get_num_retained,
57
- "Returns the number of retained items (samples) in the sketch")
58
- .def("is_estimation_mode", &req_sketch<T, C>::is_estimation_mode,
59
- "Returns True if the sketch is in estimation mode, otherwise False")
60
- .def("get_min_value", &req_sketch<T, C>::get_min_item,
61
- "Returns the minimum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
62
- .def("get_max_value", &req_sketch<T, C>::get_max_item,
63
- "Returns the maximum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
64
- .def("get_quantile", &req_sketch<T, C>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
65
- "Returns an approximation to the data value "
66
- "associated with the given normalized rank in a hypothetical sorted "
67
- "version of the input stream so far.\n"
68
- "For req_floats_sketch: if the sketch is empty this returns nan. "
69
- "For req_ints_sketch: if the sketch is empty this throws a RuntimeError.")
70
- .def(
71
- "get_quantiles",
72
- [](const req_sketch<T, C>& sk, const std::vector<double>& ranks, bool inclusive) {
73
- return sk.get_quantiles(ranks.data(), ranks.size(), inclusive);
74
- },
75
- py::arg("ranks"), py::arg("inclusive")=false,
76
- "This returns an array that could have been generated by using get_quantile() for each "
77
- "normalized rank separately.\n"
78
- "If the sketch is empty this returns an empty vector.\n"
79
- "Deprecated. Will be removed in the next major version. Use get_quantile() instead."
80
- )
81
- .def("get_rank", &req_sketch<T, C>::get_rank, py::arg("value"), py::arg("inclusive")=false,
82
- "Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
83
- "The resulting approximation has a probabilistic guarantee that can be obtained from the "
84
- "get_normalized_rank_error(False) function.\n"
85
- "With the parameter inclusive=true the weight of the given value is included into the rank."
86
- "Otherwise the rank equals the sum of the weights of values less than the given value.\n"
87
- "If the sketch is empty this returns nan.")
88
- .def(
89
- "get_pmf",
90
- [](const req_sketch<T, C>& sk, const std::vector<T>& split_points, bool inclusive) {
91
- return sk.get_PMF(split_points.data(), split_points.size(), inclusive);
92
- },
93
- py::arg("split_points"), py::arg("inclusive")=false,
94
- "Returns an approximation to the Probability Mass Function (PMF) of the input stream "
95
- "given a set of split points (values).\n"
96
- "The resulting approximations have a probabilistic guarantee that can be obtained from the "
97
- "get_normalized_rank_error(True) function.\n"
98
- "If the sketch is empty this returns an empty vector.\n"
99
- "split_points is an array of m unique, monotonically increasing float values "
100
- "that divide the real number line into m+1 consecutive disjoint intervals.\n"
101
- "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
102
- "exclusive of the right split point, with the exception that the last interval will include "
103
- "the maximum value.\n"
104
- "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
105
- "inclusive of the right split point.\n"
106
- "It is not necessary to include either the min or max values in these split points."
107
- )
108
- .def(
109
- "get_cdf",
110
- [](const req_sketch<T, C>& sk, const std::vector<T>& split_points, bool inclusive) {
111
- return sk.get_CDF(split_points.data(), split_points.size(), inclusive);
112
- },
113
- py::arg("split_points"), py::arg("inclusive")=false,
114
- "Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
115
- "cumulative analog of the PMF, of the input stream given a set of split points (values).\n"
116
- "The resulting approximations have a probabilistic guarantee that can be obtained from the "
117
- "get_normalized_rank_error(True) function.\n"
118
- "If the sketch is empty this returns an empty vector.\n"
119
- "split_points is an array of m unique, monotonically increasing float values "
120
- "that divide the real number line into m+1 consecutive disjoint intervals.\n"
121
- "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
122
- "exclusive of the right split point, with the exception that the last interval will include "
123
- "the maximum value.\n"
124
- "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
125
- "inclusive of the right split point.\n"
126
- "It is not necessary to include either the min or max values in these split points."
127
- )
128
- .def("get_rank_lower_bound", &req_sketch<T, C>::get_rank_lower_bound, py::arg("rank"), py::arg("num_std_dev"),
129
- "Returns an approximate lower bound on the given normalized rank.\n"
130
- "Normalized rank must be a value between 0.0 and 1.0 (inclusive); "
131
- "the number of standard deviations must be 1, 2, or 3.")
132
- .def("get_rank_upper_bound", &req_sketch<T, C>::get_rank_upper_bound, py::arg("rank"), py::arg("num_std_dev"),
133
- "Returns an approximate upper bound on the given normalized rank.\n"
134
- "Normalized rank must be a value between 0.0 and 1.0 (inclusive); "
135
- "the number of standard deviations must be 1, 2, or 3.")
136
- .def_static("get_RSE", &req_sketch<T, C>::get_RSE,
137
- py::arg("k"), py::arg("rank"), py::arg("is_hra"), py::arg("n"),
138
- "Returns an a priori estimate of relative standard error (RSE, expressed as a number in [0,1]). "
139
- "Derived from Lemma 12 in http://arxiv.org/abs/2004.01668v2, but the constant factors have been "
140
- "modified based on empirical measurements, for a given value of parameter k.\n"
141
- "Normalized rank must be a value between 0.0 and 1.0 (inclusive). If is_hra is True, uses high "
142
- "rank accuracy mode, else low rank accuracy. N is an estimate of the total number of points "
143
- "provided to the sketch.")
144
- .def("__iter__", [](const req_sketch<T, C>& s) { return py::make_iterator(s.begin(), s.end()); });
145
-
146
- add_serialization<T>(req_class);
147
- add_vector_update<T>(req_class);
148
- }
149
-
150
- void init_req(py::module &m) {
151
- bind_req_sketch<int, std::less<int>>(m, "req_ints_sketch");
152
- bind_req_sketch<float, std::less<float>>(m, "req_floats_sketch");
153
- bind_req_sketch<py::object, py_object_lt>(m, "req_items_sketch");
154
- }
@@ -1,166 +0,0 @@
1
- /*
2
- * Licensed to the Apache Software Foundation (ASF) under one
3
- * or more contributor license agreements. See the NOTICE file
4
- * distributed with this work for additional information
5
- * regarding copyright ownership. The ASF licenses this file
6
- * to you under the Apache License, Version 2.0 (the
7
- * "License"); you may not use this file except in compliance
8
- * with the License. You may obtain a copy of the License at
9
- *
10
- * http://www.apache.org/licenses/LICENSE-2.0
11
- *
12
- * Unless required by applicable law or agreed to in writing,
13
- * software distributed under the License is distributed on an
14
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
- * KIND, either express or implied. See the License for the
16
- * specific language governing permissions and limitations
17
- * under the License.
18
- */
19
-
20
- #include <pybind11/pybind11.h>
21
- #include <pybind11/stl.h>
22
-
23
- #include "theta_sketch.hpp"
24
- #include "theta_union.hpp"
25
- #include "theta_intersection.hpp"
26
- #include "theta_a_not_b.hpp"
27
- #include "theta_jaccard_similarity.hpp"
28
- #include "common_defs.hpp"
29
-
30
- namespace py = pybind11;
31
-
32
- void init_theta(py::module &m) {
33
- using namespace datasketches;
34
-
35
- py::class_<theta_sketch>(m, "theta_sketch")
36
- .def("__str__", &theta_sketch::to_string, py::arg("print_items")=false,
37
- "Produces a string summary of the sketch")
38
- .def("to_string", &theta_sketch::to_string, py::arg("print_items")=false,
39
- "Produces a string summary of the sketch")
40
- .def("is_empty", &theta_sketch::is_empty,
41
- "Returns True if the sketch is empty, otherwise False")
42
- .def("get_estimate", &theta_sketch::get_estimate,
43
- "Estimate of the distinct count of the input stream")
44
- .def("get_upper_bound", &theta_sketch::get_upper_bound, py::arg("num_std_devs"),
45
- "Returns an approximate upper bound on the estimate at standard deviations in {1, 2, 3}")
46
- .def("get_lower_bound", &theta_sketch::get_lower_bound, py::arg("num_std_devs"),
47
- "Returns an approximate lower bound on the estimate at standard deviations in {1, 2, 3}")
48
- .def("is_estimation_mode", &theta_sketch::is_estimation_mode,
49
- "Returns True if sketch is in estimation mode, otherwise False")
50
- .def("get_theta", &theta_sketch::get_theta,
51
- "Returns theta (effective sampling rate) as a fraction from 0 to 1")
52
- .def("get_theta64", &theta_sketch::get_theta64,
53
- "Returns theta as 64-bit value")
54
- .def("get_num_retained", &theta_sketch::get_num_retained,
55
- "Returns the number of items currently in the sketch")
56
- .def("get_seed_hash", &theta_sketch::get_seed_hash,
57
- "Returns a hash of the seed used in the sketch")
58
- .def("is_ordered", &theta_sketch::is_ordered,
59
- "Returns True if the sketch entries are sorted, otherwise False")
60
- .def("__iter__", [](const theta_sketch& s) { return py::make_iterator(s.begin(), s.end()); })
61
- ;
62
-
63
- py::class_<update_theta_sketch, theta_sketch>(m, "update_theta_sketch")
64
- .def(
65
- py::init([](uint8_t lg_k, double p, uint64_t seed) {
66
- return update_theta_sketch::builder().set_lg_k(lg_k).set_p(p).set_seed(seed).build();
67
- }),
68
- py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED
69
- )
70
- .def(py::init<const update_theta_sketch&>())
71
- .def("update", (void (update_theta_sketch::*)(int64_t)) &update_theta_sketch::update, py::arg("datum"),
72
- "Updates the sketch with the given integral value")
73
- .def("update", (void (update_theta_sketch::*)(double)) &update_theta_sketch::update, py::arg("datum"),
74
- "Updates the sketch with the given floating point value")
75
- .def("update", (void (update_theta_sketch::*)(const std::string&)) &update_theta_sketch::update, py::arg("datum"),
76
- "Updates the sketch with the given string")
77
- .def("compact", &update_theta_sketch::compact, py::arg("ordered")=true,
78
- "Returns a compacted form of the sketch, optionally sorting it")
79
- ;
80
-
81
- py::class_<compact_theta_sketch, theta_sketch>(m, "compact_theta_sketch")
82
- .def(py::init<const compact_theta_sketch&>())
83
- .def(py::init<const theta_sketch&, bool>())
84
- .def(
85
- "serialize",
86
- [](const compact_theta_sketch& sk) {
87
- auto bytes = sk.serialize();
88
- return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
89
- },
90
- "Serializes the sketch into a bytes object"
91
- )
92
- .def_static(
93
- "deserialize",
94
- [](const std::string& bytes, uint64_t seed) {
95
- return compact_theta_sketch::deserialize(bytes.data(), bytes.size(), seed);
96
- },
97
- py::arg("bytes"), py::arg("seed")=DEFAULT_SEED,
98
- "Reads a bytes object and returns the corresponding compact_theta_sketch"
99
- );
100
-
101
- py::class_<theta_union>(m, "theta_union")
102
- .def(
103
- py::init([](uint8_t lg_k, double p, uint64_t seed) {
104
- return theta_union::builder().set_lg_k(lg_k).set_p(p).set_seed(seed).build();
105
- }),
106
- py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED
107
- )
108
- .def("update", &theta_union::update<const theta_sketch&>, py::arg("sketch"),
109
- "Updates the union with the given sketch")
110
- .def("get_result", &theta_union::get_result, py::arg("ordered")=true,
111
- "Returns the sketch corresponding to the union result")
112
- ;
113
-
114
- py::class_<theta_intersection>(m, "theta_intersection")
115
- .def(py::init<uint64_t>(), py::arg("seed")=DEFAULT_SEED)
116
- .def(py::init<const theta_intersection&>())
117
- .def("update", &theta_intersection::update<const theta_sketch&>, py::arg("sketch"),
118
- "Intersections the provided sketch with the current intersection state")
119
- .def("get_result", &theta_intersection::get_result, py::arg("ordered")=true,
120
- "Returns the sketch corresponding to the intersection result")
121
- .def("has_result", &theta_intersection::has_result,
122
- "Returns True if the intersection has a valid result, otherwise False")
123
- ;
124
-
125
- py::class_<theta_a_not_b>(m, "theta_a_not_b")
126
- .def(py::init<uint64_t>(), py::arg("seed")=DEFAULT_SEED)
127
- .def(
128
- "compute",
129
- &theta_a_not_b::compute<const theta_sketch&, const theta_sketch&>,
130
- py::arg("a"), py::arg("b"), py::arg("ordered")=true,
131
- "Returns a sketch with the result of applying the A-not-B operation on the given inputs"
132
- )
133
- ;
134
-
135
- py::class_<theta_jaccard_similarity>(m, "theta_jaccard_similarity")
136
- .def_static(
137
- "jaccard",
138
- [](const theta_sketch& sketch_a, const theta_sketch& sketch_b, uint64_t seed) {
139
- return theta_jaccard_similarity::jaccard(sketch_a, sketch_b, seed);
140
- },
141
- py::arg("sketch_a"), py::arg("sketch_b"), py::arg("seed")=DEFAULT_SEED,
142
- "Returns a list with {lower_bound, estimate, upper_bound} of the Jaccard similarity between sketches"
143
- )
144
- .def_static(
145
- "exactly_equal",
146
- &theta_jaccard_similarity::exactly_equal<const theta_sketch&, const theta_sketch&>,
147
- py::arg("sketch_a"), py::arg("sketch_b"), py::arg("seed")=DEFAULT_SEED,
148
- "Returns True if sketch_a and sketch_b are equivalent, otherwise False"
149
- )
150
- .def_static(
151
- "similarity_test",
152
- &theta_jaccard_similarity::similarity_test<const theta_sketch&, const theta_sketch&>,
153
- py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED,
154
- "Tests similarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard "
155
- "index J_{LB} of the actual and expected sketches. If J_{LB} >= threshold, then the sketches are considered "
156
- "to be similar with a confidence of 97.7% and returns True, otherwise False.")
157
- .def_static(
158
- "dissimilarity_test",
159
- &theta_jaccard_similarity::dissimilarity_test<const theta_sketch&, const theta_sketch&>,
160
- py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED,
161
- "Tests dissimilarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard "
162
- "index J_{UB} of the actual and expected sketches. If J_{UB} <= threshold, then the sketches are considered "
163
- "to be dissimilar with a confidence of 97.7% and returns True, otherwise False."
164
- )
165
- ;
166
- }