datasketches 0.3.0 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (114)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  7. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  8. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  9. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  10. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  11. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  12. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  13. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  14. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  15. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  16. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  18. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -1
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +2 -2
  73. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  74. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  75. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  76. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  77. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  78. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  79. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  80. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  81. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  82. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  83. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  84. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  86. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  87. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  88. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  89. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  90. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  94. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  96. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  97. data/vendor/datasketches-cpp/setup.py +1 -1
  98. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  99. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  101. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  102. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  104. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  105. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  107. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  108. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  109. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  110. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  111. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  112. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  113. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  114. metadata +31 -3
@@ -17,194 +17,142 @@
17
17
  * under the License.
18
18
  */
19
19
 
20
+ #include "py_object_lt.hpp"
21
+ #include "py_object_ostream.hpp"
22
+ #include "quantile_conditional.hpp"
20
23
  #include "kll_sketch.hpp"
21
24
 
22
25
  #include <pybind11/pybind11.h>
23
26
  #include <pybind11/stl.h>
24
- #include <pybind11/numpy.h>
25
- #include <sstream>
26
27
  #include <vector>
27
28
  #include <stdexcept>
28
29
 
29
30
  namespace py = pybind11;
30
31
 
31
- namespace datasketches {
32
-
33
- namespace python {
34
-
35
- template<typename T>
36
- kll_sketch<T> kll_sketch_deserialize(py::bytes skBytes) {
37
- std::string skStr = skBytes; // implicit cast
38
- return kll_sketch<T>::deserialize(skStr.c_str(), skStr.length());
39
- }
40
-
41
- template<typename T>
42
- py::object kll_sketch_serialize(const kll_sketch<T>& sk) {
43
- auto serResult = sk.serialize();
44
- return py::bytes((char*)serResult.data(), serResult.size());
45
- }
46
-
47
- // maybe possible to disambiguate the static vs method rank error calls, but
48
- // this is easier for now
49
- template<typename T>
50
- double kll_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
51
- return kll_sketch<T>::get_normalized_rank_error(k, pmf);
52
- }
53
-
54
- template<typename T>
55
- py::list kll_sketch_get_quantiles(const kll_sketch<T>& sk,
56
- std::vector<double>& ranks,
57
- bool inclusive) {
58
- size_t nQuantiles = ranks.size();
59
- auto result = sk.get_quantiles(ranks.data(), nQuantiles, inclusive);
60
- // returning as std::vector<> would copy values to a list anyway
61
- py::list list(nQuantiles);
62
- for (size_t i = 0; i < nQuantiles; ++i) {
63
- list[i] = result[i];
64
- }
65
- return list;
66
- }
67
-
68
- template<typename T>
69
- py::list kll_sketch_get_pmf(const kll_sketch<T>& sk,
70
- std::vector<T>& split_points,
71
- bool inclusive) {
72
- size_t nPoints = split_points.size();
73
- auto result = sk.get_PMF(split_points.data(), nPoints, inclusive);
74
- py::list list(nPoints + 1);
75
- for (size_t i = 0; i <= nPoints; ++i) {
76
- list[i] = result[i];
77
- }
78
- return list;
79
- }
80
-
81
- template<typename T>
82
- py::list kll_sketch_get_cdf(const kll_sketch<T>& sk,
83
- std::vector<T>& split_points,
84
- bool inclusive) {
85
- size_t nPoints = split_points.size();
86
- auto result = sk.get_CDF(split_points.data(), nPoints, inclusive);
87
- py::list list(nPoints + 1);
88
- for (size_t i = 0; i <= nPoints; ++i) {
89
- list[i] = result[i];
90
- }
91
- return list;
92
- }
93
-
94
- template<typename T>
95
- void kll_sketch_update(kll_sketch<T>& sk, py::array_t<T, py::array::c_style | py::array::forcecast> items) {
96
- if (items.ndim() != 1) {
97
- throw std::invalid_argument("input data must have only one dimension. Found: "
98
- + std::to_string(items.ndim()));
99
- }
100
-
101
- auto data = items.template unchecked<1>();
102
- for (uint32_t i = 0; i < data.size(); ++i) {
103
- sk.update(data(i));
104
- }
105
- }
106
-
107
- }
108
- }
109
-
110
- namespace dspy = datasketches::python;
111
-
112
- template<typename T>
32
+ template<typename T, typename C>
113
33
  void bind_kll_sketch(py::module &m, const char* name) {
114
34
  using namespace datasketches;
115
35
 
116
- py::class_<kll_sketch<T>>(m, name)
36
+ auto kll_class = py::class_<kll_sketch<T, C>>(m, name)
117
37
  .def(py::init<uint16_t>(), py::arg("k")=kll_constants::DEFAULT_K)
118
- .def(py::init<const kll_sketch<T>&>())
119
- .def("update", (void (kll_sketch<T>::*)(const T&)) &kll_sketch<T>::update, py::arg("item"),
120
- "Updates the sketch with the given value")
121
- .def("update", &dspy::kll_sketch_update<T>, py::arg("array"),
122
- "Updates the sketch with the values in the given array")
123
- .def("merge", (void (kll_sketch<T>::*)(const kll_sketch<T>&)) &kll_sketch<T>::merge, py::arg("sketch"),
124
- "Merges the provided sketch into the this one")
125
- .def("__str__", &kll_sketch<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
126
- "Produces a string summary of the sketch")
127
- .def("to_string", &kll_sketch<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
128
- "Produces a string summary of the sketch")
129
- .def("is_empty", &kll_sketch<T>::is_empty,
130
- "Returns True if the sketch is empty, otherwise False")
131
- .def("get_k", &kll_sketch<T>::get_k,
132
- "Returns the configured parameter k")
133
- .def("get_n", &kll_sketch<T>::get_n,
134
- "Returns the length of the input stream")
135
- .def("get_num_retained", &kll_sketch<T>::get_num_retained,
136
- "Returns the number of retained items (samples) in the sketch")
137
- .def("is_estimation_mode", &kll_sketch<T>::is_estimation_mode,
138
- "Returns True if the sketch is in estimation mode, otherwise False")
139
- .def("get_min_value", &kll_sketch<T>::get_min_item,
140
- "Returns the minimum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
141
- .def("get_max_value", &kll_sketch<T>::get_max_item,
142
- "Returns the maximum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
143
- .def("get_quantile", &kll_sketch<T>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
144
- "Returns an approximation to the data value "
145
- "associated with the given normalized rank in a hypothetical sorted "
146
- "version of the input stream so far.\n"
147
- "For kll_floats_sketch: if the sketch is empty this returns nan. "
148
- "For kll_ints_sketch: if the sketch is empty this throws a RuntimeError.")
149
- .def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("ranks"), py::arg("inclusive")=false,
150
- "This returns an array that could have been generated by using get_quantile() for each "
151
- "normalized rank separately.\n"
152
- "If the sketch is empty this returns an empty vector.\n"
153
- "Deprecated. Will be removed in the next major version. Use get_quantile() instead.")
154
- .def("get_rank", &kll_sketch<T>::get_rank, py::arg("value"), py::arg("inclusive")=false,
38
+ .def(py::init<const kll_sketch<T, C>&>())
39
+ .def(
40
+ "update",
41
+ static_cast<void (kll_sketch<T, C>::*)(const T&)>(&kll_sketch<T, C>::update),
42
+ py::arg("item"),
43
+ "Updates the sketch with the given value"
44
+ )
45
+ .def("merge", (void (kll_sketch<T, C>::*)(const kll_sketch<T, C>&)) &kll_sketch<T, C>::merge, py::arg("sketch"),
46
+ "Merges the provided sketch into this one")
47
+ .def("__str__", &kll_sketch<T, C>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
48
+ "Produces a string summary of the sketch")
49
+ .def("to_string", &kll_sketch<T, C>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
50
+ "Produces a string summary of the sketch")
51
+ .def("is_empty", &kll_sketch<T, C>::is_empty,
52
+ "Returns True if the sketch is empty, otherwise False")
53
+ .def("get_k", &kll_sketch<T, C>::get_k,
54
+ "Returns the configured parameter k")
55
+ .def("get_n", &kll_sketch<T, C>::get_n,
56
+ "Returns the length of the input stream")
57
+ .def("get_num_retained", &kll_sketch<T, C>::get_num_retained,
58
+ "Returns the number of retained items (samples) in the sketch")
59
+ .def("is_estimation_mode", &kll_sketch<T, C>::is_estimation_mode,
60
+ "Returns True if the sketch is in estimation mode, otherwise False")
61
+ .def("get_min_value", &kll_sketch<T, C>::get_min_item,
62
+ "Returns the minimum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
63
+ .def("get_max_value", &kll_sketch<T, C>::get_max_item,
64
+ "Returns the maximum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
65
+ .def("get_quantile", &kll_sketch<T, C>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
66
+ "Returns an approximation to the data value "
67
+ "associated with the given normalized rank in a hypothetical sorted "
68
+ "version of the input stream so far.\n"
69
+ "For kll_floats_sketch: if the sketch is empty this returns nan. "
70
+ "For kll_ints_sketch: if the sketch is empty this throws a RuntimeError.")
71
+ .def(
72
+ "get_quantiles",
73
+ [](const kll_sketch<T, C>& sk, const std::vector<double>& ranks, bool inclusive) {
74
+ return sk.get_quantiles(ranks.data(), ranks.size(), inclusive);
75
+ },
76
+ py::arg("ranks"), py::arg("inclusive")=false,
77
+ "This returns an array that could have been generated by using get_quantile() for each "
78
+ "normalized rank separately.\n"
79
+ "If the sketch is empty this returns an empty vector.\n"
80
+ "Deprecated. Will be removed in the next major version. Use get_quantile() instead."
81
+ )
82
+ .def("get_rank", &kll_sketch<T, C>::get_rank, py::arg("value"), py::arg("inclusive")=false,
155
83
  "Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
156
84
  "The resulting approximation has a probabilistic guarantee that can be obtained from the "
157
85
  "get_normalized_rank_error(False) function.\n"
158
86
  "With the parameter inclusive=true the weight of the given value is included into the rank."
159
87
  "Otherwise the rank equals the sum of the weights of values less than the given value.\n"
160
88
  "If the sketch is empty this returns nan.")
161
- .def("get_pmf", &dspy::kll_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
162
- "Returns an approximation to the Probability Mass Function (PMF) of the input stream "
163
- "given a set of split points (values).\n"
164
- "The resulting approximations have a probabilistic guarantee that can be obtained from the "
165
- "get_normalized_rank_error(True) function.\n"
166
- "If the sketch is empty this returns an empty vector.\n"
167
- "split_points is an array of m unique, monotonically increasing float values "
168
- "that divide the real number line into m+1 consecutive disjoint intervals.\n"
169
- "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
170
- "exclusive of the right split point, with the exception that the last interval will include "
171
- "the maximum value.\n"
172
- "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
173
- "inclusive of the right split point.\n"
174
- "It is not necessary to include either the min or max values in these split points.")
175
- .def("get_cdf", &dspy::kll_sketch_get_cdf<T>, py::arg("split_points"), py::arg("inclusive")=false,
176
- "Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
177
- "cumulative analog of the PMF, of the input stream given a set of split points (values).\n"
178
- "The resulting approximations have a probabilistic guarantee that can be obtained from the "
179
- "get_normalized_rank_error(True) function.\n"
180
- "If the sketch is empty this returns an empty vector.\n"
181
- "split_points is an array of m unique, monotonically increasing float values "
182
- "that divide the real number line into m+1 consecutive disjoint intervals.\n"
183
- "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
184
- "exclusive of the right split point, with the exception that the last interval will include "
185
- "the maximum value.\n"
186
- "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
187
- "inclusive of the right split point.\n"
188
- "It is not necessary to include either the min or max values in these split points.")
189
- .def("normalized_rank_error", (double (kll_sketch<T>::*)(bool) const) &kll_sketch<T>::get_normalized_rank_error,
89
+ .def(
90
+ "get_pmf",
91
+ [](const kll_sketch<T, C>& sk, const std::vector<T>& split_points, bool inclusive) {
92
+ return sk.get_PMF(split_points.data(), split_points.size(), inclusive);
93
+ },
94
+ py::arg("split_points"), py::arg("inclusive")=false,
95
+ "Returns an approximation to the Probability Mass Function (PMF) of the input stream "
96
+ "given a set of split points (values).\n"
97
+ "The resulting approximations have a probabilistic guarantee that can be obtained from the "
98
+ "get_normalized_rank_error(True) function.\n"
99
+ "If the sketch is empty this returns an empty vector.\n"
100
+ "split_points is an array of m unique, monotonically increasing float values "
101
+ "that divide the real number line into m+1 consecutive disjoint intervals.\n"
102
+ "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
103
+ "exclusive of the right split point, with the exception that the last interval will include "
104
+ "the maximum value.\n"
105
+ "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
106
+ "inclusive of the right split point.\n"
107
+ "It is not necessary to include either the min or max values in these split points."
108
+ )
109
+ .def(
110
+ "get_cdf",
111
+ [](const kll_sketch<T, C>& sk, const std::vector<T>& split_points, bool inclusive) {
112
+ return sk.get_CDF(split_points.data(), split_points.size(), inclusive);
113
+ },
114
+ py::arg("split_points"), py::arg("inclusive")=false,
115
+ "Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
116
+ "cumulative analog of the PMF, of the input stream given a set of split points (values).\n"
117
+ "The resulting approximations have a probabilistic guarantee that can be obtained from the "
118
+ "get_normalized_rank_error(True) function.\n"
119
+ "If the sketch is empty this returns an empty vector.\n"
120
+ "split_points is an array of m unique, monotonically increasing float values "
121
+ "that divide the real number line into m+1 consecutive disjoint intervals.\n"
122
+ "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
123
+ "exclusive of the right split point, with the exception that the last interval will include "
124
+ "the maximum value.\n"
125
+ "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
126
+ "inclusive of the right split point.\n"
127
+ "It is not necessary to include either the min or max values in these split points."
128
+ )
129
+ .def(
130
+ "normalized_rank_error",
131
+ static_cast<double (kll_sketch<T, C>::*)(bool) const>(&kll_sketch<T, C>::get_normalized_rank_error),
190
132
  py::arg("as_pmf"),
191
133
  "Gets the normalized rank error for this sketch.\n"
192
134
  "If pmf is True, returns the 'double-sided' normalized rank error for the get_PMF() function.\n"
193
135
  "Otherwise, it is the 'single-sided' normalized rank error for all the other queries.\n"
194
- "Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials")
195
- .def_static("get_normalized_rank_error", &dspy::kll_sketch_generic_normalized_rank_error<T>,
136
+ "Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials"
137
+ )
138
+ .def_static(
139
+ "get_normalized_rank_error",
140
+ [](uint16_t k, bool pmf) { return kll_sketch<T, C>::get_normalized_rank_error(k, pmf); },
196
141
  py::arg("k"), py::arg("as_pmf"),
197
142
  "Gets the normalized rank error given parameters k and the pmf flag.\n"
198
143
  "If pmf is True, returns the 'double-sided' normalized rank error for the get_PMF() function.\n"
199
144
  "Otherwise, it is the 'single-sided' normalized rank error for all the other queries.\n"
200
- "Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials")
201
- .def("serialize", &dspy::kll_sketch_serialize<T>, "Serializes the sketch into a bytes object")
202
- .def_static("deserialize", &dspy::kll_sketch_deserialize<T>, "Deserializes the sketch from a bytes object")
203
- ;
145
+ "Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials"
146
+ )
147
+ .def("__iter__", [](const kll_sketch<T, C>& s) { return py::make_iterator(s.begin(), s.end()); });
148
+
149
+ add_serialization<T>(kll_class);
150
+ add_vector_update<T>(kll_class);
204
151
  }
205
152
 
206
153
  void init_kll(py::module &m) {
207
- bind_kll_sketch<int>(m, "kll_ints_sketch");
208
- bind_kll_sketch<float>(m, "kll_floats_sketch");
209
- bind_kll_sketch<double>(m, "kll_doubles_sketch");
154
+ bind_kll_sketch<int, std::less<int>>(m, "kll_ints_sketch");
155
+ bind_kll_sketch<float, std::less<float>>(m, "kll_floats_sketch");
156
+ bind_kll_sketch<double, std::less<double>>(m, "kll_doubles_sketch");
157
+ bind_kll_sketch<py::object, py_object_lt>(m, "kll_items_sketch");
210
158
  }
@@ -27,13 +27,14 @@
27
27
  namespace py = pybind11;
28
28
 
29
29
  void init_serde(py::module& m) {
30
- py::class_<datasketches::py_object_serde, datasketches::PyObjectSerDe /* <--- trampoline*/>(m, "PyObjectSerDe")
30
+ using namespace datasketches;
31
+ py::class_<py_object_serde, PyObjectSerDe /* <--- trampoline*/>(m, "PyObjectSerDe")
31
32
  .def(py::init<>())
32
- .def("get_size", &datasketches::py_object_serde::get_size, py::arg("item"),
33
+ .def("get_size", &py_object_serde::get_size, py::arg("item"),
33
34
  "Returns the size in bytes of an item")
34
- .def("to_bytes", &datasketches::py_object_serde::to_bytes, py::arg("item"),
35
+ .def("to_bytes", &py_object_serde::to_bytes, py::arg("item"),
35
36
  "Retuns a bytes object with a serialized version of an item")
36
- .def("from_bytes", &datasketches::py_object_serde::from_bytes, py::arg("data"), py::arg("offset"),
37
+ .def("from_bytes", &py_object_serde::from_bytes, py::arg("data"), py::arg("offset"),
37
38
  "Reads a bytes object starting from the given offest and returns a tuple of the reconstructed "
38
39
  "object and the number of additional bytes read")
39
40
  ;