datasketches 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/NOTICE +1 -1
  7. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  9. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  10. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  11. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  12. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  13. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  14. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  15. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  16. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  18. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  73. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  74. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  75. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  76. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  78. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  79. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  80. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  81. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  82. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  83. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  84. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  86. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  87. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  88. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  89. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  96. data/vendor/datasketches-cpp/setup.py +1 -1
  97. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  98. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  99. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  102. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  104. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  105. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  107. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  108. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  109. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  110. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  111. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  112. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  113. metadata +31 -3
@@ -0,0 +1,104 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _QUANTILE_CONDITIONAL_HPP_
21
+ #define _QUANTILE_CONDITIONAL_HPP_
22
+
23
+ /*
24
+ This header defines conditionally compiled functions shared
25
+ across the set of quantile family sketches.
26
+ */
27
+
28
+ #include "common_defs.hpp"
29
+ #include "py_serde.hpp"
30
+
31
+ #include <pybind11/pybind11.h>
32
+ #include <pybind11/numpy.h>
33
+
34
+ namespace py = pybind11;
35
+
36
+ // Serialization
37
+ // std::string and arithmetic types, where we don't need a separate serde
38
+ template<typename T, typename SK, typename std::enable_if<std::is_arithmetic<T>::value || std::is_same<std::string, T>::value, bool>::type = 0>
39
+ void add_serialization(py::class_<SK>& clazz) {
40
+ clazz.def(
41
+ "serialize",
42
+ [](const SK& sk) {
43
+ auto bytes = sk.serialize();
44
+ return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
45
+ },
46
+ "Serializes the sketch into a bytes object."
47
+ )
48
+ .def_static(
49
+ "deserialize",
50
+ [](const std::string& bytes) { return SK::deserialize(bytes.data(), bytes.size()); },
51
+ py::arg("bytes"),
52
+ "Deserializes the sketch from a bytes object."
53
+ );
54
+ }
55
+
56
+ // py::object and other types where the caller must provide a serde
57
+ template<typename T, typename SK, typename std::enable_if<!std::is_arithmetic<T>::value && !std::is_same<std::string, T>::value, bool>::type = 0>
58
+ void add_serialization(py::class_<SK>& clazz) {
59
+ clazz.def(
60
+ "serialize",
61
+ [](const SK& sk, datasketches::py_object_serde& serde) {
62
+ auto bytes = sk.serialize(0, serde);
63
+ return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
64
+ }, py::arg("serde"),
65
+ "Serializes the sketch into a bytes object using the provided serde."
66
+ )
67
+ .def_static(
68
+ "deserialize",
69
+ [](const std::string& bytes, datasketches::py_object_serde& serde) {
70
+ return SK::deserialize(bytes.data(), bytes.size(), serde);
71
+ }, py::arg("bytes"), py::arg("serde"),
72
+ "Deserializes the sketch from a bytes object using the provided serde."
73
+ );
74
+ }
75
+
76
+ // Vector Updates
77
+ // * Only allowed for POD types based on numpy restriction, which
78
+ // is equivalent to both std::is_trivial and std::is_standard_layout.
79
+ // * Nothing is added to types that are not PODs.
80
+ // POD type
81
+ template<typename T, typename SK, typename std::enable_if<std::is_trivial<T>::value && std::is_standard_layout<T>::value, bool>::type = 0>
82
+ void add_vector_update(py::class_<SK>& clazz) {
83
+ clazz.def(
84
+ "update",
85
+ [](SK& sk, py::array_t<T, py::array::c_style | py::array::forcecast> items) {
86
+ if (items.ndim() != 1) {
87
+ throw std::invalid_argument("input data must have only one dimension. Found: "
88
+ + std::to_string(items.ndim()));
89
+ }
90
+ auto array = items.template unchecked<1>();
91
+ for (uint32_t i = 0; i < array.size(); ++i) sk.update(array(i));
92
+ },
93
+ py::arg("array"),
94
+ "Updates the sketch with the values in the given array"
95
+ );
96
+ }
97
+
98
+ // non-POD type
99
+ template<typename T, typename SK, typename std::enable_if<!std::is_trivial<T>::value || !std::is_standard_layout<T>::value, bool>::type = 0>
100
+ void add_vector_update(py::class_<SK>& clazz) {
101
+ unused(clazz);
102
+ }
103
+
104
+ #endif // _QUANTILE_CONDITIONAL_HPP_
@@ -0,0 +1,136 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <memory>
21
+ #include <pybind11/pybind11.h>
22
+
23
+ #ifndef _TUPLE_POLICY_HPP_
24
+ #define _TUPLE_POLICY_HPP_
25
+
26
+ namespace py = pybind11;
27
+
28
+ namespace datasketches {
29
+
30
+ /**
31
+ * @brief tuple_policy provides the underlying base class from
32
+ * which native Python policies ultimately inherit. The actual
33
+ * policies implement TuplePolicy, as shown in TuplePolicy.py
34
+ */
35
+ struct tuple_policy {
36
+ virtual py::object create_summary() const = 0;
37
+ virtual py::object update_summary(py::object& summary, const py::object& update) const = 0;
38
+ virtual py::object operator()(py::object& summary, const py::object& update) const = 0;
39
+ virtual ~tuple_policy() = default;
40
+ };
41
+
42
+ /**
43
+ * @brief TuplePolicy provides the "trampoline" class for pybind11
44
+ * that allows for a native Python implementation of tuple
45
+ * sketch policies.
46
+ */
47
+ struct TuplePolicy : public tuple_policy {
48
+ using tuple_policy::tuple_policy;
49
+
50
+ /**
51
+ * @brief Create a summary object
52
+ *
53
+ * @return py::object representing a new summary
54
+ */
55
+ py::object create_summary() const override {
56
+ PYBIND11_OVERRIDE_PURE(
57
+ py::object, // Return type
58
+ tuple_policy, // Parent class
59
+ create_summary, // Name of function in C++ (must match Python name)
60
+ // Argument(s) -- if any
61
+ );
62
+ }
63
+
64
+ /**
65
+ * @brief Update a summary object using this policy
66
+ *
67
+ * @param summary The current summary to update
68
+ * @param update The new value with which to update the summary
69
+ * @return py::object The updated summary
70
+ */
71
+ py::object update_summary(py::object& summary, const py::object& update) const override {
72
+ PYBIND11_OVERRIDE_PURE(
73
+ py::object, // Return type
74
+ tuple_policy, // Parent class
75
+ update_summary, // Name of function in C++ (must match Python name)
76
+ summary, update // Arguments
77
+ );
78
+ }
79
+
80
+ /**
81
+ * @brief Applies this policy to summary with the provided update
82
+ *
83
+ * @param summary The current summary on which to apply the policy
84
+ * @param update An update to apply to the current summary
85
+ * @return py::object The potentially modified summary
86
+ */
87
+ py::object operator()(py::object& summary, const py::object& update) const override {
88
+ PYBIND11_OVERRIDE_PURE_NAME(
89
+ py::object, // Return type
90
+ tuple_policy, // Parent class
91
+ "__call__", // Name of function in python
92
+ operator(), // Name of function in C++
93
+ summary, update // Arguemnts
94
+ );
95
+ }
96
+ };
97
+
98
+ /* The tuple_policy_holder provides a concrete class that dispatches calls
99
+ * from the sketch to the tuple_policy. This class is needed to provide a
100
+ * concrete object to produce a compiled library, but library users should
101
+ * never need to use this directly.
102
+ */
103
+ struct tuple_policy_holder {
104
+ explicit tuple_policy_holder(std::shared_ptr<tuple_policy> policy) : _policy(policy) {}
105
+ tuple_policy_holder(const tuple_policy_holder& other) : _policy(other._policy) {}
106
+ tuple_policy_holder(tuple_policy_holder&& other) : _policy(std::move(other._policy)) {}
107
+ tuple_policy_holder& operator=(const tuple_policy_holder& other) { _policy = other._policy; return *this; }
108
+ tuple_policy_holder& operator=(tuple_policy_holder&& other) { std::swap(_policy, other._policy); return *this; }
109
+
110
+ py::object create() const { return _policy->create_summary(); }
111
+
112
+ void update(py::object& summary, const py::object& update) const {
113
+ summary = _policy->update_summary(summary, update);
114
+ }
115
+
116
+ void operator()(py::object& summary, const py::object& update) const {
117
+ summary = _policy->operator()(summary, update);
118
+ }
119
+
120
+ private:
121
+ std::shared_ptr<tuple_policy> _policy;
122
+ };
123
+
124
+ /* A degenerate policy used to enable Jaccard Similarity on tuple sketches,
125
+ * where the computation requires a union and intersection over the keys but
126
+ * does not need to observe the summaries.
127
+ */
128
+ struct dummy_jaccard_policy {
129
+ void operator()(py::object&, const py::object&) const {
130
+ return;
131
+ }
132
+ };
133
+
134
+ }
135
+
136
+ #endif // _TUPLE_POLICY_HPP_
@@ -0,0 +1,101 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <pybind11/pybind11.h>
21
+
22
+ #include "count_min.hpp"
23
+ #include "common_defs.hpp"
24
+
25
+ namespace py = pybind11;
26
+
27
+ template<typename W>
28
+ void bind_count_min_sketch(py::module &m, const char* name) {
29
+ using namespace datasketches;
30
+
31
+ py::class_<count_min_sketch<W>>(m, name)
32
+ .def(py::init<uint8_t, uint32_t, uint64_t>(), py::arg("num_hashes"), py::arg("num_buckets"), py::arg("seed")=DEFAULT_SEED)
33
+ .def(py::init<const count_min_sketch<W>&>())
34
+ .def_static("suggest_num_buckets", &count_min_sketch<W>::suggest_num_buckets, py::arg("relative_error"),
35
+ "Suggests the number of buckets needed to achieve an accuracy within the provided "
36
+ "relative_error. For example, when relative_error = 0.05, the returned frequency estimates "
37
+ "satisfy the 'relative_error' guarantee that never overestimates the weights but may "
38
+ "underestimate the weights by 5% of the total weight in the sketch. "
39
+ "Returns the number of hash buckets at every level of the sketch required in order to obtain "
40
+ "the specified relative error.")
41
+ .def_static("suggest_num_hashes", &count_min_sketch<W>::suggest_num_hashes, py::arg("confidence"),
42
+ "Suggests the number of hashes needed to achieve the provided confidence. For example, "
43
+ "with 95% confidence, frequency estimates satisfy the 'relative_error' guarantee. "
44
+ "Returns the number of hash functions that are required in order to achieve the specified "
45
+ "confidence of the sketch. confidence = 1 - delta, with delta denoting the sketch failure probability.")
46
+ .def("__str__", &count_min_sketch<W>::to_string,
47
+ "Produces a string summary of the sketch")
48
+ .def("to_string", &count_min_sketch<W>::to_string,
49
+ "Produces a string summary of the sketch")
50
+ .def("is_empty", &count_min_sketch<W>::is_empty,
51
+ "Returns True if the sketch has seen no items, otherwise False")
52
+ .def("get_num_hashes", &count_min_sketch<W>::get_num_hashes,
53
+ "Returns the configured number of hashes for the sketch")
54
+ .def("get_num_buckets", &count_min_sketch<W>::get_num_buckets,
55
+ "Returns the configured number of buckets for the sketch")
56
+ .def("get_seed", &count_min_sketch<W>::get_seed,
57
+ "Returns the base hash seed for the sketch")
58
+ .def("get_relative_error", &count_min_sketch<W>::get_relative_error,
59
+ "Returns the maximum permissible error for any frequency estimate query")
60
+ .def("get_total_weight", &count_min_sketch<W>::get_total_weight,
61
+ "Returns the total weight currently inserted into the stream")
62
+ .def("update", static_cast<void (count_min_sketch<W>::*)(int64_t, W)>(&count_min_sketch<W>::update), py::arg("item"), py::arg("weight")=1.0,
63
+ "Updates the sketch with the given 64-bit integer value")
64
+ .def("update", static_cast<void (count_min_sketch<W>::*)(const std::string&, W)>(&count_min_sketch<W>::update), py::arg("item"), py::arg("weight")=1.0,
65
+ "Updates the sketch with the given string")
66
+ .def("get_estimate", static_cast<W (count_min_sketch<W>::*)(int64_t) const>(&count_min_sketch<W>::get_estimate), py::arg("item"),
67
+ "Returns an estimate of the frequency of the provided 64-bit integer value")
68
+ .def("get_estimate", static_cast<W (count_min_sketch<W>::*)(const std::string&) const>(&count_min_sketch<W>::get_estimate), py::arg("item"),
69
+ "Returns an estimate of the frequency of the provided string")
70
+ .def("get_upper_bound", static_cast<W (count_min_sketch<W>::*)(int64_t) const>(&count_min_sketch<W>::get_upper_bound), py::arg("item"),
71
+ "Returns an upper bound on the estimate for the given 64-bit integer value")
72
+ .def("get_upper_bound", static_cast<W (count_min_sketch<W>::*)(const std::string&) const>(&count_min_sketch<W>::get_upper_bound), py::arg("item"),
73
+ "Returns an upper bound on the estimate for the provided string")
74
+ .def("get_lower_bound", static_cast<W (count_min_sketch<W>::*)(int64_t) const>(&count_min_sketch<W>::get_lower_bound), py::arg("item"),
75
+ "Returns an lower bound on the estimate for the given 64-bit integer value")
76
+ .def("get_lower_bound", static_cast<W (count_min_sketch<W>::*)(const std::string&) const>(&count_min_sketch<W>::get_lower_bound), py::arg("item"),
77
+ "Returns an lower bound on the estimate for the provided string")
78
+ .def("merge", &count_min_sketch<W>::merge, py::arg("other"),
79
+ "Merges the provided other sketch into this one")
80
+ .def("get_serialized_size_bytes", &count_min_sketch<W>::get_serialized_size_bytes,
81
+ "Returns the size in bytes of the serialized image of the sketch")
82
+ .def(
83
+ "serialize",
84
+ [](const count_min_sketch<W>& sk) {
85
+ auto bytes = sk.serialize();
86
+ return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
87
+ },
88
+ "Serializes the sketch into a bytes object"
89
+ )
90
+ .def_static(
91
+ "deserialize",
92
+ [](const std::string& bytes) { return count_min_sketch<W>::deserialize(bytes.data(), bytes.size()); },
93
+ py::arg("bytes"),
94
+ "Reads a bytes object and returns the corresponding count_min_sketch"
95
+ );
96
+ }
97
+
98
+ void init_count_min(py::module &m) {
99
+ bind_count_min_sketch<double>(m, "count_min_sketch");
100
+ }
101
+
@@ -17,7 +17,6 @@
17
17
  * under the License.
18
18
  */
19
19
 
20
- #include <sstream>
21
20
  #include <pybind11/pybind11.h>
22
21
 
23
22
  #include "cpc_sketch.hpp"
@@ -27,28 +26,6 @@
27
26
 
28
27
  namespace py = pybind11;
29
28
 
30
- namespace datasketches {
31
- namespace python {
32
-
33
- cpc_sketch* cpc_sketch_deserialize(py::bytes skBytes) {
34
- std::string skStr = skBytes; // implicit cast
35
- return new cpc_sketch(cpc_sketch::deserialize(skStr.c_str(), skStr.length()));
36
- }
37
-
38
- py::object cpc_sketch_serialize(const cpc_sketch& sk) {
39
- auto serResult = sk.serialize();
40
- return py::bytes((char*)serResult.data(), serResult.size());
41
- }
42
-
43
- cpc_sketch* cpc_union_get_result(const cpc_union& u) {
44
- return new cpc_sketch(u.get_result());
45
- }
46
-
47
- }
48
- }
49
-
50
- namespace dspy = datasketches::python;
51
-
52
29
  void init_cpc(py::module &m) {
53
30
  using namespace datasketches;
54
31
 
@@ -59,10 +36,6 @@ void init_cpc(py::module &m) {
59
36
  "Produces a string summary of the sketch")
60
37
  .def("to_string", &cpc_sketch::to_string,
61
38
  "Produces a string summary of the sketch")
62
- .def("serialize", &dspy::cpc_sketch_serialize,
63
- "Serializes the sketch into a bytes object")
64
- .def_static("deserialize", &dspy::cpc_sketch_deserialize,
65
- "Reads a bytes object and returns the corresponding cpc_sketch")
66
39
  .def<void (cpc_sketch::*)(uint64_t)>("update", &cpc_sketch::update, py::arg("datum"),
67
40
  "Updates the sketch with the given 64-bit integer value")
68
41
  .def<void (cpc_sketch::*)(double)>("update", &cpc_sketch::update, py::arg("datum"),
@@ -70,21 +43,34 @@ void init_cpc(py::module &m) {
70
43
  .def<void (cpc_sketch::*)(const std::string&)>("update", &cpc_sketch::update, py::arg("datum"),
71
44
  "Updates the sketch with the given string")
72
45
  .def("is_empty", &cpc_sketch::is_empty,
73
- "Returns True if the sketch is empty, otherwise Dalse")
46
+ "Returns True if the sketch is empty, otherwise False")
74
47
  .def("get_estimate", &cpc_sketch::get_estimate,
75
48
  "Estimate of the distinct count of the input stream")
76
49
  .def("get_lower_bound", &cpc_sketch::get_lower_bound, py::arg("kappa"),
77
50
  "Returns an approximate lower bound on the estimate for kappa values in {1, 2, 3}, roughly corresponding to standard deviations")
78
51
  .def("get_upper_bound", &cpc_sketch::get_upper_bound, py::arg("kappa"),
79
52
  "Returns an approximate upper bound on the estimate for kappa values in {1, 2, 3}, roughly corresponding to standard deviations")
80
- ;
53
+ .def(
54
+ "serialize",
55
+ [](const cpc_sketch& sk) {
56
+ auto bytes = sk.serialize();
57
+ return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
58
+ },
59
+ "Serializes the sketch into a bytes object"
60
+ )
61
+ .def_static(
62
+ "deserialize",
63
+ [](const std::string& bytes) { return cpc_sketch::deserialize(bytes.data(), bytes.size()); },
64
+ py::arg("bytes"),
65
+ "Reads a bytes object and returns the corresponding cpc_sketch"
66
+ );
81
67
 
82
68
  py::class_<cpc_union>(m, "cpc_union")
83
69
  .def(py::init<uint8_t, uint64_t>(), py::arg("lg_k"), py::arg("seed")=DEFAULT_SEED)
84
70
  .def(py::init<const cpc_union&>())
85
71
  .def("update", (void (cpc_union::*)(const cpc_sketch&)) &cpc_union::update, py::arg("sketch"),
86
72
  "Updates the union with the provided CPC sketch")
87
- .def("get_result", &dspy::cpc_union_get_result,
73
+ .def("get_result", &cpc_union::get_result,
88
74
  "Returns a CPC sketch with the result of the union")
89
75
  ;
90
76
  }
@@ -27,9 +27,12 @@ void init_kll(py::module& m);
27
27
  void init_fi(py::module& m);
28
28
  void init_cpc(py::module& m);
29
29
  void init_theta(py::module& m);
30
+ void init_tuple(py::module& m);
30
31
  void init_vo(py::module& m);
31
32
  void init_req(py::module& m);
32
33
  void init_quantiles(py::module& m);
34
+ void init_count_min(py::module& m);
35
+ void init_density(py::module& m);
33
36
  void init_vector_of_kll(py::module& m);
34
37
 
35
38
  // supporting objects
@@ -42,9 +45,12 @@ PYBIND11_MODULE(_datasketches, m) {
42
45
  init_fi(m);
43
46
  init_cpc(m);
44
47
  init_theta(m);
48
+ init_tuple(m);
45
49
  init_vo(m);
46
50
  init_req(m);
47
51
  init_quantiles(m);
52
+ init_count_min(m);
53
+ init_density(m);
48
54
  init_vector_of_kll(m);
49
55
 
50
56
  init_kolmogorov_smirnov(m);
@@ -0,0 +1,95 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <pybind11/pybind11.h>
21
+ #include <pybind11/stl.h>
22
+ #include <pybind11/numpy.h>
23
+ #include <vector>
24
+
25
+ #include "kernel_function.hpp"
26
+ #include "density_sketch.hpp"
27
+
28
+ namespace py = pybind11;
29
+
30
+ template<typename T, typename K>
31
+ void bind_density_sketch(py::module &m, const char* name) {
32
+ using namespace datasketches;
33
+
34
+ py::class_<density_sketch<T, K>>(m, name)
35
+ .def(
36
+ py::init([](uint16_t k, uint32_t dim, std::shared_ptr<kernel_function> kernel) {
37
+ kernel_function_holder holder(kernel);
38
+ return density_sketch<T, K>(k, dim, holder);
39
+ }),
40
+ py::arg("k"), py::arg("dim"), py::arg("kernel"))
41
+ .def("update", static_cast<void (density_sketch<T, K>::*)(const std::vector<T>&)>(&density_sketch<T, K>::update),
42
+ "Updates the sketch with the given vector")
43
+ .def("merge", static_cast<void (density_sketch<T, K>::*)(const density_sketch<T, K>&)>(&density_sketch<T, K>::merge), py::arg("sketch"),
44
+ "Merges the provided sketch into this one")
45
+ .def("is_empty", &density_sketch<T, K>::is_empty,
46
+ "Returns True if the sketch is empty, otherwise False")
47
+ .def("get_k", &density_sketch<T, K>::get_k,
48
+ "Returns the configured parameter k")
49
+ .def("get_dim", &density_sketch<T, K>::get_dim,
50
+ "Returns the configured parameter dim")
51
+ .def("get_n", &density_sketch<T, K>::get_n,
52
+ "Returns the length of the input stream")
53
+ .def("get_num_retained", &density_sketch<T, K>::get_num_retained,
54
+ "Returns the number of retained items (samples) in the sketch")
55
+ .def("is_estimation_mode", &density_sketch<T, K>::is_estimation_mode,
56
+ "Returns True if the sketch is in estimation mode, otherwise False")
57
+ .def("get_estimate", &density_sketch<T, K>::get_estimate, py::arg("point"),
58
+ "Returns an approximate density at the given point")
59
+ .def("__str__", &density_sketch<T, K>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
60
+ "Produces a string summary of the sketch")
61
+ .def("to_string", &density_sketch<T, K>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
62
+ "Produces a string summary of the sketch")
63
+ .def("__iter__", [](const density_sketch<T, K>& s){ return py::make_iterator(s.begin(), s.end()); })
64
+ .def("serialize",
65
+ [](const density_sketch<T, K>& sk) {
66
+ auto bytes = sk.serialize();
67
+ return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
68
+ },
69
+ "Serializes the sketch into a bytes object"
70
+ )
71
+ .def_static(
72
+ "deserialize",
73
+ [](const std::string& bytes, std::shared_ptr<kernel_function> kernel) {
74
+ kernel_function_holder holder(kernel);
75
+ return density_sketch<T, K>::deserialize(bytes.data(), bytes.size(), holder);
76
+ },
77
+ py::arg("bytes"), py::arg("kernel"),
78
+ "Reads a bytes object and returns the corresponding density_sketch"
79
+ );;
80
+ }
81
+
82
+ void init_density(py::module &m) {
83
+ using namespace datasketches;
84
+
85
+ // generic kernel function
86
+ py::class_<kernel_function, KernelFunction, std::shared_ptr<kernel_function>>(m, "KernelFunction")
87
+ .def(py::init())
88
+ .def("__call__", &kernel_function::operator(), py::arg("a"), py::arg("b"))
89
+ ;
90
+
91
+ // the old sketch names can almost be defined, but the kernel_function_holder won't work in init()
92
+ //bind_density_sketch<float, gaussian_kernel<float>>(m, "density_floats_sketch");
93
+ //bind_density_sketch<double, gaussian_kernel<double>>(m, "density_doubles_sketch");
94
+ bind_density_sketch<double, kernel_function_holder>(m, "_density_sketch");
95
+ }