datasketches 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/NOTICE +1 -1
  7. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  9. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  10. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  11. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  12. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  13. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  14. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  15. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  16. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  18. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  73. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  74. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  75. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  76. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  78. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  79. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  80. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  81. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  82. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  83. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  84. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  86. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  87. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  88. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  89. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  96. data/vendor/datasketches-cpp/setup.py +1 -1
  97. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  98. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  99. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  102. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  104. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  105. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  107. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  108. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  109. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  110. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  111. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  112. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  113. metadata +31 -3
@@ -17,199 +17,138 @@
17
17
  * under the License.
18
18
  */
19
19
 
20
+ #include "py_object_lt.hpp"
21
+ #include "py_object_ostream.hpp"
22
+ #include "quantile_conditional.hpp"
20
23
  #include "req_sketch.hpp"
21
24
 
22
25
  #include <pybind11/pybind11.h>
23
26
  #include <pybind11/stl.h>
24
27
  #include <pybind11/numpy.h>
25
- #include <sstream>
26
28
  #include <vector>
27
29
  #include <stdexcept>
28
30
 
29
31
  namespace py = pybind11;
30
32
 
31
- namespace datasketches {
32
-
33
- namespace python {
34
-
35
- template<typename T>
36
- req_sketch<T> req_sketch_deserialize(py::bytes sk_bytes) {
37
- std::string sk_str = sk_bytes; // implicit cast
38
- return req_sketch<T>::deserialize(sk_str.c_str(), sk_str.length());
39
- }
40
-
41
- template<typename T>
42
- py::object req_sketch_serialize(const req_sketch<T>& sk) {
43
- auto ser_result = sk.serialize();
44
- return py::bytes((char*)ser_result.data(), ser_result.size());
45
- }
46
-
47
- // maybe possible to disambiguate the static vs method rank error calls, but
48
- // this is easier for now
49
- template<typename T>
50
- double req_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
51
- return req_sketch<T>::get_normalized_rank_error(k, pmf);
52
- }
53
-
54
- template<typename T>
55
- py::list req_sketch_get_quantiles(const req_sketch<T>& sk,
56
- std::vector<double>& ranks,
57
- bool inclusive) {
58
- size_t n_quantiles = ranks.size();
59
- auto result = sk.get_quantiles(ranks.data(), n_quantiles, inclusive);
60
- // returning as std::vector<> would copy values to a list anyway
61
- py::list list(n_quantiles);
62
- for (size_t i = 0; i < n_quantiles; ++i) {
63
- list[i] = result[i];
64
- }
65
- return list;
66
- }
67
-
68
- template<typename T>
69
- py::list req_sketch_get_pmf(const req_sketch<T>& sk,
70
- std::vector<T>& split_points,
71
- bool inclusive) {
72
- size_t n_points = split_points.size();
73
- auto result = sk.get_PMF(split_points.data(), n_points, inclusive);
74
- py::list list(n_points + 1);
75
- for (size_t i = 0; i <= n_points; ++i) {
76
- list[i] = result[i];
77
- }
78
- return list;
79
- }
80
-
81
- template<typename T>
82
- py::list req_sketch_get_cdf(const req_sketch<T>& sk,
83
- std::vector<T>& split_points,
84
- bool inclusive) {
85
- size_t n_points = split_points.size();
86
- auto result = sk.get_CDF(split_points.data(), n_points, inclusive);
87
- py::list list(n_points + 1);
88
- for (size_t i = 0; i <= n_points; ++i) {
89
- list[i] = result[i];
90
- }
91
- return list;
92
- }
93
-
94
- template<typename T>
95
- void req_sketch_update(req_sketch<T>& sk, py::array_t<T, py::array::c_style | py::array::forcecast> items) {
96
- if (items.ndim() != 1) {
97
- throw std::invalid_argument("input data must have only one dimension. Found: "
98
- + std::to_string(items.ndim()));
99
- }
100
-
101
- auto data = items.template unchecked<1>();
102
- for (uint32_t i = 0; i < data.size(); ++i) {
103
- sk.update(data(i));
104
- }
105
- }
106
-
107
- }
108
- }
109
-
110
- namespace dspy = datasketches::python;
111
-
112
- template<typename T>
33
+ template<typename T, typename C>
113
34
  void bind_req_sketch(py::module &m, const char* name) {
114
35
  using namespace datasketches;
115
36
 
116
- py::class_<req_sketch<T>>(m, name)
37
+ auto req_class = py::class_<req_sketch<T, C>>(m, name)
117
38
  .def(py::init<uint16_t, bool>(), py::arg("k")=12, py::arg("is_hra")=true)
118
- .def(py::init<const req_sketch<T>&>())
119
- .def("update", (void (req_sketch<T>::*)(const T&)) &req_sketch<T>::update, py::arg("item"),
120
- "Updates the sketch with the given value")
121
- .def("update", &dspy::req_sketch_update<T>, py::arg("array"),
122
- "Updates the sketch with the values in the given array")
123
- .def("merge", (void (req_sketch<T>::*)(const req_sketch<T>&)) &req_sketch<T>::merge, py::arg("sketch"),
124
- "Merges the provided sketch into the this one")
125
- .def("__str__", &req_sketch<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
126
- "Produces a string summary of the sketch")
127
- .def("to_string", &req_sketch<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
128
- "Produces a string summary of the sketch")
129
- .def("is_hra", &req_sketch<T>::is_HRA,
130
- "Returns True if the sketch is in High Rank Accuracy mode, otherwise False")
131
- .def("is_empty", &req_sketch<T>::is_empty,
132
- "Returns True if the sketch is empty, otherwise False")
133
- .def("get_k", &req_sketch<T>::get_k,
134
- "Returns the configured parameter k")
135
- .def("get_n", &req_sketch<T>::get_n,
136
- "Returns the length of the input stream")
137
- .def("get_num_retained", &req_sketch<T>::get_num_retained,
138
- "Returns the number of retained items (samples) in the sketch")
139
- .def("is_estimation_mode", &req_sketch<T>::is_estimation_mode,
140
- "Returns True if the sketch is in estimation mode, otherwise False")
141
- .def("get_min_value", &req_sketch<T>::get_min_item,
142
- "Returns the minimum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
143
- .def("get_max_value", &req_sketch<T>::get_max_item,
144
- "Returns the maximum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
145
- .def("get_quantile", &req_sketch<T>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
146
- "Returns an approximation to the data value "
147
- "associated with the given normalized rank in a hypothetical sorted "
148
- "version of the input stream so far.\n"
149
- "For req_floats_sketch: if the sketch is empty this returns nan. "
150
- "For req_ints_sketch: if the sketch is empty this throws a RuntimeError.")
151
- .def("get_quantiles", &dspy::req_sketch_get_quantiles<T>, py::arg("ranks"), py::arg("inclusive")=false,
152
- "This returns an array that could have been generated by using get_quantile() for each "
153
- "normalized rank separately.\n"
154
- "If the sketch is empty this returns an empty vector.\n"
155
- "Deprecated. Will be removed in the next major version. Use get_quantile() instead.")
156
- .def("get_rank", &req_sketch<T>::get_rank, py::arg("value"), py::arg("inclusive")=false,
157
- "Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
158
- "The resulting approximation has a probabilistic guarantee that can be obtained from the "
159
- "get_normalized_rank_error(False) function.\n"
160
- "With the parameter inclusive=true the weight of the given value is included into the rank."
161
- "Otherwise the rank equals the sum of the weights of values less than the given value.\n"
162
- "If the sketch is empty this returns nan.")
163
- .def("get_pmf", &dspy::req_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
164
- "Returns an approximation to the Probability Mass Function (PMF) of the input stream "
165
- "given a set of split points (values).\n"
166
- "The resulting approximations have a probabilistic guarantee that can be obtained from the "
167
- "get_normalized_rank_error(True) function.\n"
168
- "If the sketch is empty this returns an empty vector.\n"
169
- "split_points is an array of m unique, monotonically increasing float values "
170
- "that divide the real number line into m+1 consecutive disjoint intervals.\n"
171
- "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
172
- "exclusive of the right split point, with the exception that the last interval will include "
173
- "the maximum value.\n"
174
- "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
175
- "inclusive of the right split point.\n"
176
- "It is not necessary to include either the min or max values in these split points.")
177
- .def("get_cdf", &dspy::req_sketch_get_cdf<T>, py::arg("split_points"), py::arg("inclusive")=false,
178
- "Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
179
- "cumulative analog of the PMF, of the input stream given a set of split points (values).\n"
180
- "The resulting approximations have a probabilistic guarantee that can be obtained from the "
181
- "get_normalized_rank_error(True) function.\n"
182
- "If the sketch is empty this returns an empty vector.\n"
183
- "split_points is an array of m unique, monotonically increasing float values "
184
- "that divide the real number line into m+1 consecutive disjoint intervals.\n"
185
- "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
186
- "exclusive of the right split point, with the exception that the last interval will include "
187
- "the maximum value.\n"
188
- "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
189
- "inclusive of the right split point.\n"
190
- "It is not necessary to include either the min or max values in these split points.")
191
- .def("get_rank_lower_bound", &req_sketch<T>::get_rank_lower_bound, py::arg("rank"), py::arg("num_std_dev"),
192
- "Returns an approximate lower bound on the given normalized rank.\n"
193
- "Normalized rank must be a value between 0.0 and 1.0 (inclusive); "
194
- "the number of standard deviations must be 1, 2, or 3.")
195
- .def("get_rank_upper_bound", &req_sketch<T>::get_rank_upper_bound, py::arg("rank"), py::arg("num_std_dev"),
196
- "Returns an approximate upper bound on the given normalized rank.\n"
197
- "Normalized rank must be a value between 0.0 and 1.0 (inclusive); "
198
- "the number of standard deviations must be 1, 2, or 3.")
199
- .def_static("get_RSE", &req_sketch<T>::get_RSE,
200
- py::arg("k"), py::arg("rank"), py::arg("is_hra"), py::arg("n"),
201
- "Returns an a priori estimate of relative standard error (RSE, expressed as a number in [0,1]). "
202
- "Derived from Lemma 12 in http://arxiv.org/abs/2004.01668v2, but the constant factors have been "
203
- "modified based on empirical measurements, for a given value of parameter k.\n"
204
- "Normalized rank must be a value between 0.0 and 1.0 (inclusive). If is_hra is True, uses high "
205
- "rank accuracy mode, else low rank accuracy. N is an estimate of the total number of points "
206
- "provided to the sketch.")
207
- .def("serialize", &dspy::req_sketch_serialize<T>, "Serializes the sketch into a bytes object")
208
- .def_static("deserialize", &dspy::req_sketch_deserialize<T>, "Deserializes the sketch from a bytes object")
209
- ;
39
+ .def(py::init<const req_sketch<T, C>&>())
40
+ .def("update", (void (req_sketch<T, C>::*)(const T&)) &req_sketch<T, C>::update, py::arg("item"),
41
+ "Updates the sketch with the given value")
42
+ .def("merge", (void (req_sketch<T, C>::*)(const req_sketch<T, C>&)) &req_sketch<T, C>::merge, py::arg("sketch"),
43
+ "Merges the provided sketch into this one")
44
+ .def("__str__", &req_sketch<T, C>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
45
+ "Produces a string summary of the sketch")
46
+ .def("to_string", &req_sketch<T, C>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
47
+ "Produces a string summary of the sketch")
48
+ .def("is_hra", &req_sketch<T, C>::is_HRA,
49
+ "Returns True if the sketch is in High Rank Accuracy mode, otherwise False")
50
+ .def("is_empty", &req_sketch<T, C>::is_empty,
51
+ "Returns True if the sketch is empty, otherwise False")
52
+ .def("get_k", &req_sketch<T, C>::get_k,
53
+ "Returns the configured parameter k")
54
+ .def("get_n", &req_sketch<T, C>::get_n,
55
+ "Returns the length of the input stream")
56
+ .def("get_num_retained", &req_sketch<T, C>::get_num_retained,
57
+ "Returns the number of retained items (samples) in the sketch")
58
+ .def("is_estimation_mode", &req_sketch<T, C>::is_estimation_mode,
59
+ "Returns True if the sketch is in estimation mode, otherwise False")
60
+ .def("get_min_value", &req_sketch<T, C>::get_min_item,
61
+ "Returns the minimum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
62
+ .def("get_max_value", &req_sketch<T, C>::get_max_item,
63
+ "Returns the maximum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
64
+ .def("get_quantile", &req_sketch<T, C>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
65
+ "Returns an approximation to the data value "
66
+ "associated with the given normalized rank in a hypothetical sorted "
67
+ "version of the input stream so far.\n"
68
+ "For req_floats_sketch: if the sketch is empty this returns nan. "
69
+ "For req_ints_sketch: if the sketch is empty this throws a RuntimeError.")
70
+ .def(
71
+ "get_quantiles",
72
+ [](const req_sketch<T, C>& sk, const std::vector<double>& ranks, bool inclusive) {
73
+ return sk.get_quantiles(ranks.data(), ranks.size(), inclusive);
74
+ },
75
+ py::arg("ranks"), py::arg("inclusive")=false,
76
+ "This returns an array that could have been generated by using get_quantile() for each "
77
+ "normalized rank separately.\n"
78
+ "If the sketch is empty this returns an empty vector.\n"
79
+ "Deprecated. Will be removed in the next major version. Use get_quantile() instead."
80
+ )
81
+ .def("get_rank", &req_sketch<T, C>::get_rank, py::arg("value"), py::arg("inclusive")=false,
82
+ "Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
83
+ "The resulting approximation has a probabilistic guarantee that can be obtained from the "
84
+ "get_normalized_rank_error(False) function.\n"
85
+ "With the parameter inclusive=true the weight of the given value is included into the rank."
86
+ "Otherwise the rank equals the sum of the weights of values less than the given value.\n"
87
+ "If the sketch is empty this returns nan.")
88
+ .def(
89
+ "get_pmf",
90
+ [](const req_sketch<T, C>& sk, const std::vector<T>& split_points, bool inclusive) {
91
+ return sk.get_PMF(split_points.data(), split_points.size(), inclusive);
92
+ },
93
+ py::arg("split_points"), py::arg("inclusive")=false,
94
+ "Returns an approximation to the Probability Mass Function (PMF) of the input stream "
95
+ "given a set of split points (values).\n"
96
+ "The resulting approximations have a probabilistic guarantee that can be obtained from the "
97
+ "get_normalized_rank_error(True) function.\n"
98
+ "If the sketch is empty this returns an empty vector.\n"
99
+ "split_points is an array of m unique, monotonically increasing float values "
100
+ "that divide the real number line into m+1 consecutive disjoint intervals.\n"
101
+ "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
102
+ "exclusive of the right split point, with the exception that the last interval will include "
103
+ "the maximum value.\n"
104
+ "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
105
+ "inclusive of the right split point.\n"
106
+ "It is not necessary to include either the min or max values in these split points."
107
+ )
108
+ .def(
109
+ "get_cdf",
110
+ [](const req_sketch<T, C>& sk, const std::vector<T>& split_points, bool inclusive) {
111
+ return sk.get_CDF(split_points.data(), split_points.size(), inclusive);
112
+ },
113
+ py::arg("split_points"), py::arg("inclusive")=false,
114
+ "Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
115
+ "cumulative analog of the PMF, of the input stream given a set of split points (values).\n"
116
+ "The resulting approximations have a probabilistic guarantee that can be obtained from the "
117
+ "get_normalized_rank_error(True) function.\n"
118
+ "If the sketch is empty this returns an empty vector.\n"
119
+ "split_points is an array of m unique, monotonically increasing float values "
120
+ "that divide the real number line into m+1 consecutive disjoint intervals.\n"
121
+ "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
122
+ "exclusive of the right split point, with the exception that the last interval will include "
123
+ "the maximum value.\n"
124
+ "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
125
+ "inclusive of the right split point.\n"
126
+ "It is not necessary to include either the min or max values in these split points."
127
+ )
128
+ .def("get_rank_lower_bound", &req_sketch<T, C>::get_rank_lower_bound, py::arg("rank"), py::arg("num_std_dev"),
129
+ "Returns an approximate lower bound on the given normalized rank.\n"
130
+ "Normalized rank must be a value between 0.0 and 1.0 (inclusive); "
131
+ "the number of standard deviations must be 1, 2, or 3.")
132
+ .def("get_rank_upper_bound", &req_sketch<T, C>::get_rank_upper_bound, py::arg("rank"), py::arg("num_std_dev"),
133
+ "Returns an approximate upper bound on the given normalized rank.\n"
134
+ "Normalized rank must be a value between 0.0 and 1.0 (inclusive); "
135
+ "the number of standard deviations must be 1, 2, or 3.")
136
+ .def_static("get_RSE", &req_sketch<T, C>::get_RSE,
137
+ py::arg("k"), py::arg("rank"), py::arg("is_hra"), py::arg("n"),
138
+ "Returns an a priori estimate of relative standard error (RSE, expressed as a number in [0,1]). "
139
+ "Derived from Lemma 12 in http://arxiv.org/abs/2004.01668v2, but the constant factors have been "
140
+ "modified based on empirical measurements, for a given value of parameter k.\n"
141
+ "Normalized rank must be a value between 0.0 and 1.0 (inclusive). If is_hra is True, uses high "
142
+ "rank accuracy mode, else low rank accuracy. N is an estimate of the total number of points "
143
+ "provided to the sketch.")
144
+ .def("__iter__", [](const req_sketch<T, C>& s) { return py::make_iterator(s.begin(), s.end()); });
145
+
146
+ add_serialization<T>(req_class);
147
+ add_vector_update<T>(req_class);
210
148
  }
211
149
 
212
150
  void init_req(py::module &m) {
213
- bind_req_sketch<int>(m, "req_ints_sketch");
214
- bind_req_sketch<float>(m, "req_floats_sketch");
151
+ bind_req_sketch<int, std::less<int>>(m, "req_ints_sketch");
152
+ bind_req_sketch<float, std::less<float>>(m, "req_floats_sketch");
153
+ bind_req_sketch<py::object, py_object_lt>(m, "req_items_sketch");
215
154
  }
@@ -17,7 +17,6 @@
17
17
  * under the License.
18
18
  */
19
19
 
20
- #include <sstream>
21
20
  #include <pybind11/pybind11.h>
22
21
  #include <pybind11/stl.h>
23
22
 
@@ -28,51 +27,8 @@
28
27
  #include "theta_jaccard_similarity.hpp"
29
28
  #include "common_defs.hpp"
30
29
 
31
-
32
30
  namespace py = pybind11;
33
31
 
34
- namespace datasketches {
35
- namespace python {
36
-
37
- update_theta_sketch update_theta_sketch_factory(uint8_t lg_k, double p, uint64_t seed) {
38
- update_theta_sketch::builder builder;
39
- builder.set_lg_k(lg_k);
40
- builder.set_p(p);
41
- builder.set_seed(seed);
42
- return builder.build();
43
- }
44
-
45
- theta_union theta_union_factory(uint8_t lg_k, double p, uint64_t seed) {
46
- theta_union::builder builder;
47
- builder.set_lg_k(lg_k);
48
- builder.set_p(p);
49
- builder.set_seed(seed);
50
- return builder.build();
51
- }
52
-
53
- uint16_t theta_sketch_get_seed_hash(const theta_sketch& sk) {
54
- return sk.get_seed_hash();
55
- }
56
-
57
- py::object compact_theta_sketch_serialize(const compact_theta_sketch& sk) {
58
- auto serResult = sk.serialize();
59
- return py::bytes((char*)serResult.data(), serResult.size());
60
- }
61
-
62
- compact_theta_sketch compact_theta_sketch_deserialize(py::bytes skBytes, uint64_t seed) {
63
- std::string skStr = skBytes; // implicit cast
64
- return compact_theta_sketch::deserialize(skStr.c_str(), skStr.length(), seed);
65
- }
66
-
67
- py::list theta_jaccard_sim_computation(const theta_sketch& sketch_a, const theta_sketch& sketch_b, uint64_t seed) {
68
- return py::cast(theta_jaccard_similarity::jaccard(sketch_a, sketch_b, seed));
69
- }
70
-
71
- }
72
- }
73
-
74
- namespace dspy = datasketches::python;
75
-
76
32
  void init_theta(py::module &m) {
77
33
  using namespace datasketches;
78
34
 
@@ -93,17 +49,24 @@ void init_theta(py::module &m) {
93
49
  "Returns True if sketch is in estimation mode, otherwise False")
94
50
  .def("get_theta", &theta_sketch::get_theta,
95
51
  "Returns theta (effective sampling rate) as a fraction from 0 to 1")
52
+ .def("get_theta64", &theta_sketch::get_theta64,
53
+ "Returns theta as 64-bit value")
96
54
  .def("get_num_retained", &theta_sketch::get_num_retained,
97
- "Retunrs the number of items currently in the sketch")
98
- .def("get_seed_hash", &dspy::theta_sketch_get_seed_hash,
55
+ "Returns the number of items currently in the sketch")
56
+ .def("get_seed_hash", &theta_sketch::get_seed_hash,
99
57
  "Returns a hash of the seed used in the sketch")
100
58
  .def("is_ordered", &theta_sketch::is_ordered,
101
59
  "Returns True if the sketch entries are sorted, otherwise False")
60
+ .def("__iter__", [](const theta_sketch& s) { return py::make_iterator(s.begin(), s.end()); })
102
61
  ;
103
62
 
104
63
  py::class_<update_theta_sketch, theta_sketch>(m, "update_theta_sketch")
105
- .def(py::init(&dspy::update_theta_sketch_factory),
106
- py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED)
64
+ .def(
65
+ py::init([](uint8_t lg_k, double p, uint64_t seed) {
66
+ return update_theta_sketch::builder().set_lg_k(lg_k).set_p(p).set_seed(seed).build();
67
+ }),
68
+ py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED
69
+ )
107
70
  .def(py::init<const update_theta_sketch&>())
108
71
  .def("update", (void (update_theta_sketch::*)(int64_t)) &update_theta_sketch::update, py::arg("datum"),
109
72
  "Updates the sketch with the given integral value")
@@ -118,16 +81,30 @@ void init_theta(py::module &m) {
118
81
  py::class_<compact_theta_sketch, theta_sketch>(m, "compact_theta_sketch")
119
82
  .def(py::init<const compact_theta_sketch&>())
120
83
  .def(py::init<const theta_sketch&, bool>())
121
- .def("serialize", &dspy::compact_theta_sketch_serialize,
122
- "Serializes the sketch into a bytes object")
123
- .def_static("deserialize", &dspy::compact_theta_sketch_deserialize,
84
+ .def(
85
+ "serialize",
86
+ [](const compact_theta_sketch& sk) {
87
+ auto bytes = sk.serialize();
88
+ return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
89
+ },
90
+ "Serializes the sketch into a bytes object"
91
+ )
92
+ .def_static(
93
+ "deserialize",
94
+ [](const std::string& bytes, uint64_t seed) {
95
+ return compact_theta_sketch::deserialize(bytes.data(), bytes.size(), seed);
96
+ },
124
97
  py::arg("bytes"), py::arg("seed")=DEFAULT_SEED,
125
- "Reads a bytes object and returns the corresponding compact_theta_sketch")
126
- ;
98
+ "Reads a bytes object and returns the corresponding compact_theta_sketch"
99
+ );
127
100
 
128
101
  py::class_<theta_union>(m, "theta_union")
129
- .def(py::init(&dspy::theta_union_factory),
130
- py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED)
102
+ .def(
103
+ py::init([](uint8_t lg_k, double p, uint64_t seed) {
104
+ return theta_union::builder().set_lg_k(lg_k).set_p(p).set_seed(seed).build();
105
+ }),
106
+ py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED
107
+ )
131
108
  .def("update", &theta_union::update<const theta_sketch&>, py::arg("sketch"),
132
109
  "Updates the union with the given sketch")
133
110
  .def("get_result", &theta_union::get_result, py::arg("ordered")=true,
@@ -147,26 +124,43 @@ void init_theta(py::module &m) {
147
124
 
148
125
  py::class_<theta_a_not_b>(m, "theta_a_not_b")
149
126
  .def(py::init<uint64_t>(), py::arg("seed")=DEFAULT_SEED)
150
- .def("compute", &theta_a_not_b::compute<const theta_sketch&, const theta_sketch&>, py::arg("a"), py::arg("b"), py::arg("ordered")=true,
151
- "Returns a sketch with the reuslt of appying the A-not-B operation on the given inputs")
127
+ .def(
128
+ "compute",
129
+ &theta_a_not_b::compute<const theta_sketch&, const theta_sketch&>,
130
+ py::arg("a"), py::arg("b"), py::arg("ordered")=true,
131
+ "Returns a sketch with the result of applying the A-not-B operation on the given inputs"
132
+ )
152
133
  ;
153
134
 
154
135
  py::class_<theta_jaccard_similarity>(m, "theta_jaccard_similarity")
155
- .def_static("jaccard", &dspy::theta_jaccard_sim_computation,
156
- py::arg("sketch_a"), py::arg("sketch_b"), py::arg("seed")=DEFAULT_SEED,
157
- "Returns a list with {lower_bound, estimate, upper_bound} of the Jaccard similarity between sketches")
158
- .def_static("exactly_equal", &theta_jaccard_similarity::exactly_equal<const theta_sketch&, const theta_sketch&>,
159
- py::arg("sketch_a"), py::arg("sketch_b"), py::arg("seed")=DEFAULT_SEED,
160
- "Returns True if sketch_a and sketch_b are equivalent, otherwise False")
161
- .def_static("similarity_test", &theta_jaccard_similarity::similarity_test<const theta_sketch&, const theta_sketch&>,
162
- py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED,
163
- "Tests similarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard "
164
- "index J_{LB} of the actual and expected sketches. If J_{LB} >= threshold, then the sketches are considered "
165
- "to be similar sith a confidence of 97.7% and returns True, otherwise False.")
166
- .def_static("dissimilarity_test", &theta_jaccard_similarity::dissimilarity_test<const theta_sketch&, const theta_sketch&>,
167
- py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED,
168
- "Tests dissimilarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard "
169
- "index J_{UB} of the actual and expected sketches. If J_{UB} <= threshold, then the sketches are considered "
170
- "to be dissimilar sith a confidence of 97.7% and returns True, otherwise False.")
136
+ .def_static(
137
+ "jaccard",
138
+ [](const theta_sketch& sketch_a, const theta_sketch& sketch_b, uint64_t seed) {
139
+ return theta_jaccard_similarity::jaccard(sketch_a, sketch_b, seed);
140
+ },
141
+ py::arg("sketch_a"), py::arg("sketch_b"), py::arg("seed")=DEFAULT_SEED,
142
+ "Returns a list with {lower_bound, estimate, upper_bound} of the Jaccard similarity between sketches"
143
+ )
144
+ .def_static(
145
+ "exactly_equal",
146
+ &theta_jaccard_similarity::exactly_equal<const theta_sketch&, const theta_sketch&>,
147
+ py::arg("sketch_a"), py::arg("sketch_b"), py::arg("seed")=DEFAULT_SEED,
148
+ "Returns True if sketch_a and sketch_b are equivalent, otherwise False"
149
+ )
150
+ .def_static(
151
+ "similarity_test",
152
+ &theta_jaccard_similarity::similarity_test<const theta_sketch&, const theta_sketch&>,
153
+ py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED,
154
+ "Tests similarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard "
155
+ "index J_{LB} of the actual and expected sketches. If J_{LB} >= threshold, then the sketches are considered "
156
+ "to be similar with a confidence of 97.7% and returns True, otherwise False.")
157
+ .def_static(
158
+ "dissimilarity_test",
159
+ &theta_jaccard_similarity::dissimilarity_test<const theta_sketch&, const theta_sketch&>,
160
+ py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED,
161
+ "Tests dissimilarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard "
162
+ "index J_{UB} of the actual and expected sketches. If J_{UB} <= threshold, then the sketches are considered "
163
+ "to be dissimilar with a confidence of 97.7% and returns True, otherwise False."
164
+ )
171
165
  ;
172
166
  }