RubyGems - datasketches - Versions diffs - 0.2.7 → 0.3.0 - Mend

datasketches 0.2.7 → 0.3.0

Files changed (86) hide show

data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp CHANGED Viewed

@@ -51,39 +51,17 @@ double kll_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
   return kll_sketch<T>::get_normalized_rank_error(k, pmf);
 }
-template<typename T>
-double kll_sketch_get_rank(const kll_sketch<T>& sk, const T& item, bool inclusive) {
-  if (inclusive)
-    return sk.template get_rank<true>(item);
-  else
-    return sk.template get_rank<false>(item);
-}
-template<typename T>
-T kll_sketch_get_quantile(const kll_sketch<T>& sk,
-                          double rank,
-                          bool inclusive) {
-  if (inclusive)
-    return T(sk.template get_quantile<true>(rank));
-  else
-    return T(sk.template get_quantile<false>(rank));
-}
 template<typename T>
 py::list kll_sketch_get_quantiles(const kll_sketch<T>& sk,
-                                  std::vector<double>& fractions,
+                                  std::vector<double>& ranks,
                                   bool inclusive) {
-  size_t nQuantiles = fractions.size();
-  auto result = inclusive ?
-      sk.template get_quantiles<true>(fractions.data(), nQuantiles)
-    : sk.template get_quantiles<false>(fractions.data(), nQuantiles);
+  size_t nQuantiles = ranks.size();
+  auto result = sk.get_quantiles(ranks.data(), nQuantiles, inclusive);
   // returning as std::vector<> would copy values to a list anyway
   py::list list(nQuantiles);
   for (size_t i = 0; i < nQuantiles; ++i) {
       list[i] = result[i];
   }
   return list;
 }
@@ -92,15 +70,11 @@ py::list kll_sketch_get_pmf(const kll_sketch<T>& sk,
                             std::vector<T>& split_points,
                             bool inclusive) {
   size_t nPoints = split_points.size();
-  auto result = inclusive ?
-      sk.template get_PMF<true>(split_points.data(), nPoints)
-    : sk.template get_PMF<false>(split_points.data(), nPoints);
+  auto result = sk.get_PMF(split_points.data(), nPoints, inclusive);
   py::list list(nPoints + 1);
   for (size_t i = 0; i <= nPoints; ++i) {
     list[i] = result[i];
   }
   return list;
 }
@@ -109,15 +83,11 @@ py::list kll_sketch_get_cdf(const kll_sketch<T>& sk,
                             std::vector<T>& split_points,
                             bool inclusive) {
   size_t nPoints = split_points.size();
-  auto result = inclusive ?
-      sk.template get_CDF<true>(split_points.data(), nPoints)
-    : sk.template get_CDF<false>(split_points.data(), nPoints);
+  auto result = sk.get_CDF(split_points.data(), nPoints, inclusive);
   py::list list(nPoints + 1);
   for (size_t i = 0; i <= nPoints; ++i) {
     list[i] = result[i];
   }
   return list;
 }
@@ -166,29 +136,23 @@ void bind_kll_sketch(py::module &m, const char* name) {
          "Returns the number of retained items (samples) in the sketch")
     .def("is_estimation_mode", &kll_sketch<T>::is_estimation_mode,
          "Returns True if the sketch is in estimation mode, otherwise False")
-    .def("get_min_value", &kll_sketch<T>::get_min_value,
-         "Returns the minimum value from the stream. If empty, kll_floats_sketch retursn nan; kll_ints_sketch throws a RuntimeError")
-    .def("get_max_value", &kll_sketch<T>::get_max_value,
-         "Returns the maximum value from the stream. If empty, kll_floats_sketch retursn nan; kll_ints_sketch throws a RuntimeError")
-    .def("get_quantile", &dspy::kll_sketch_get_quantile<T>, py::arg("fraction"), py::arg("inclusive")=false,
-         "Returns an approximation to the value of the data item "
-         "that would be preceded by the given fraction of a hypothetical sorted "
+    .def("get_min_value", &kll_sketch<T>::get_min_item,
+         "Returns the minimum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
+    .def("get_max_value", &kll_sketch<T>::get_max_item,
+         "Returns the maximum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
+    .def("get_quantile", &kll_sketch<T>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
+         "Returns an approximation to the data value "
+         "associated with the given normalized rank in a hypothetical sorted "
          "version of the input stream so far.\n"
-         "Note that this method has a fairly large overhead (microseconds instead of nanoseconds) "
-         "so it should not be called multiple times to get different quantiles from the same "
-         "sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
          "For kll_floats_sketch: if the sketch is empty this returns nan. "
          "For kll_ints_sketch: if the sketch is empty this throws a RuntimeError.")
-    .def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("fractions"), py::arg("inclusive")=false,
-         "This is a more efficient multiple-query version of get_quantile().\n"
+    .def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("ranks"), py::arg("inclusive")=false,
          "This returns an array that could have been generated by using get_quantile() for each "
-         "fractional rank separately, but would be very inefficient. "
-         "This method incurs the internal set-up overhead once and obtains multiple quantile values in "
-         "a single query. It is strongly recommend that this method be used instead of multiple calls "
-         "to get_quantile().\n"
-         "If the sketch is empty this returns an empty vector.")
-    .def("get_rank", &dspy::kll_sketch_get_rank<T>, py::arg("value"), py::arg("inclusive")=false,
-         "Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
+         "normalized rank separately.\n"
+         "If the sketch is empty this returns an empty vector.\n"
+         "Deprecated. Will be removed in the next major version. Use get_quantile() instead.")
+    .def("get_rank", &kll_sketch<T>::get_rank, py::arg("value"), py::arg("inclusive")=false,
+         "Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
          "The resulting approximation has a probabilistic guarantee that can be obtained from the "
          "get_normalized_rank_error(False) function.\n"
          "With the parameter inclusive=true the weight of the given value is included into the rank."

data/vendor/datasketches-cpp/python/src/py_serde.cpp ADDED Viewed

@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <cstring>
+#include "memory_operations.hpp"
+#include "py_serde.hpp"
+#include <pybind11/pybind11.h>
+namespace py = pybind11;
+// Registers the serde base class with the Python module `m` under the name
+// "PyObjectSerDe". The second template argument is the trampoline type that
+// lets Python subclasses supply get_size / to_bytes / from_bytes.
+void init_serde(py::module& m) {
+  py::class_<datasketches::py_object_serde, datasketches::PyObjectSerDe /* <--- trampoline*/>(m, "PyObjectSerDe")
+    .def(py::init<>())
+    .def("get_size", &datasketches::py_object_serde::get_size, py::arg("item"),
+        "Returns the size in bytes of an item")
+    .def("to_bytes", &datasketches::py_object_serde::to_bytes, py::arg("item"),
+        "Returns a bytes object with a serialized version of an item")
+    .def("from_bytes", &datasketches::py_object_serde::from_bytes, py::arg("data"), py::arg("offset"),
+        "Reads a bytes object starting from the given offset and returns a tuple of the reconstructed "
+        "object and the number of additional bytes read")
+    ;
+}
+namespace datasketches {
+  // Serialized size of a single item, obtained by delegating to get_size().
+  size_t py_object_serde::size_of_item(const py::object& item) const {
+    const size_t item_bytes = get_size(item);
+    return item_bytes;
+  }
+  // Writes the serialized form of `num` items back-to-back into the buffer at
+  // `ptr` and returns the total number of bytes written. Before each copy the
+  // running total is validated against `capacity` through check_memory_size().
+  size_t py_object_serde::serialize(void* ptr, size_t capacity, const py::object* items, unsigned num) const {
+    // hold the GIL while Python objects are being handled
+    py::gil_scoped_acquire acquire;
+    char* const out = static_cast<char*>(ptr);
+    size_t total = 0;
+    for (unsigned idx = 0; idx != num; ++idx) {
+      // to_bytes() hands back py::bytes, converted implicitly to std::string
+      const std::string encoded = to_bytes(items[idx]);
+      const size_t len = encoded.size();
+      check_memory_size(total + len, capacity);
+      std::memcpy(out + total, encoded.data(), len);
+      total += len;
+    }
+    py::gil_scoped_release release;
+    return total;
+  }
+  // Rebuilds `num` Python objects from the `capacity` bytes at `ptr`, placement-
+  // constructing them into `items`, and returns the number of bytes consumed.
+  // Every item comes from from_bytes(), which yields a tuple of
+  // (reconstructed object, number of bytes it used).
+  // Throws py::value_error when from_bytes() raises a Python error; a length that
+  // runs past `capacity` is reported through check_memory_size(). Whatever the
+  // failure, references held by the items built so far are dropped before the
+  // exception leaves this function.
+  size_t py_object_serde::deserialize(const void* ptr, size_t capacity, py::object* items, unsigned num) const {
+    size_t bytes_read = 0;
+    unsigned constructed = 0; // items[0 .. constructed) currently hold live objects
+    py::gil_scoped_acquire acquire;
+    // copy data into bytes only once
+    py::bytes bytes(static_cast<const char*>(ptr), capacity);
+    try {
+      for (; constructed < num; ++constructed) {
+        py::tuple bytes_and_len;
+        try {
+          bytes_and_len = from_bytes(bytes, bytes_read);
+        } catch (const py::error_already_set&) {
+          throw py::value_error("Error reading value in from_bytes");
+        }
+        // these casts throw if from_bytes() returned something malformed; they sit
+        // inside the outer try so that the clean-up below still runs
+        const size_t length = py::cast<size_t>(bytes_and_len[1]);
+        if (bytes_read + length > capacity) {
+          // pass the offending total so it can be reported; this call will throw
+          check_memory_size(bytes_read + length, capacity);
+        }
+        new (&items[constructed]) py::object(py::cast<py::object>(bytes_and_len[0]));
+        bytes_read += length;
+      }
+    } catch (...) {
+      // clean up what we've allocated, then let the original exception continue
+      for (unsigned j = 0; j < constructed; ++j) {
+        items[j].dec_ref();
+      }
+      throw;
+    }
+    py::gil_scoped_release release;
+    return bytes_read;
+  }
+} // namespace datasketches

data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp CHANGED Viewed

@@ -49,41 +49,17 @@ double quantiles_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
   return quantiles_sketch<T>::get_normalized_rank_error(k, pmf);
 }
-template<typename T>
-double quantiles_sketch_get_rank(const quantiles_sketch<T>& sk,
-                                 const T& item,
-                                 bool inclusive) {
-  if (inclusive)
-    return sk.template get_rank<true>(item);
-  else
-    return sk.template get_rank<false>(item);
-}
-template<typename T>
-T quantiles_sketch_get_quantile(const quantiles_sketch<T>& sk,
-                                double rank,
-                                bool inclusive) {
-  if (inclusive)
-    return T(sk.template get_quantile<true>(rank));
-  else
-    return T(sk.template get_quantile<false>(rank));
-}
 template<typename T>
 py::list quantiles_sketch_get_quantiles(const quantiles_sketch<T>& sk,
-                                        std::vector<double>& fractions,
+                                        std::vector<double>& ranks,
                                         bool inclusive) {
-  size_t n_quantiles = fractions.size();
-  auto result = inclusive
-     ? sk.template get_quantiles<true>(&fractions[0], static_cast<uint32_t>(n_quantiles))
-     : sk.template get_quantiles<false>(&fractions[0], static_cast<uint32_t>(n_quantiles));
+  size_t n_quantiles = ranks.size();
+  auto result = sk.get_quantiles(ranks.data(), static_cast<uint32_t>(n_quantiles), inclusive);
   // returning as std::vector<> would copy values to a list anyway
   py::list list(n_quantiles);
   for (size_t i = 0; i < n_quantiles; ++i) {
       list[i] = result[i];
   }
   return list;
 }
@@ -92,15 +68,11 @@ py::list quantiles_sketch_get_pmf(const quantiles_sketch<T>& sk,
                                   std::vector<T>& split_points,
                                   bool inclusive) {
   size_t n_points = split_points.size();
-  auto result = inclusive
-     ? sk.template get_PMF<true>(&split_points[0], n_points)
-     : sk.template get_PMF<false>(&split_points[0], n_points);
+  auto result = sk.get_PMF(split_points.data(), n_points, inclusive);
   py::list list(n_points + 1);
   for (size_t i = 0; i <= n_points; ++i) {
     list[i] = result[i];
   }
   return list;
 }
@@ -109,15 +81,11 @@ py::list quantiles_sketch_get_cdf(const quantiles_sketch<T>& sk,
                                   std::vector<T>& split_points,
                                   bool inclusive) {
   size_t n_points = split_points.size();
-  auto result = inclusive
-     ? sk.template get_CDF<true>(&split_points[0], n_points)
-     : sk.template get_CDF<false>(&split_points[0], n_points);
+  auto result = sk.get_CDF(split_points.data(), n_points, inclusive);
   py::list list(n_points + 1);
   for (size_t i = 0; i <= n_points; ++i) {
     list[i] = result[i];
   }
   return list;
 }
@@ -166,31 +134,27 @@ void bind_quantiles_sketch(py::module &m, const char* name) {
          "Returns the number of retained items (samples) in the sketch")
     .def("is_estimation_mode", &quantiles_sketch<T>::is_estimation_mode,
          "Returns True if the sketch is in estimation mode, otherwise False")
-    .def("get_min_value", &quantiles_sketch<T>::get_min_value,
+    .def("get_min_value", &quantiles_sketch<T>::get_min_item,
          "Returns the minimum value from the stream. If empty, quantiles_floats_sketch returns nan; quantiles_ints_sketch throws a RuntimeError")
-    .def("get_max_value", &quantiles_sketch<T>::get_max_value,
+    .def("get_max_value", &quantiles_sketch<T>::get_max_item,
          "Returns the maximum value from the stream. If empty, quantiles_floats_sketch returns nan; quantiles_ints_sketch throws a RuntimeError")
-    .def("get_quantile", &dspy::quantiles_sketch_get_quantile<T>, py::arg("rank"), py::arg("inclusive")=false,
-         "Returns an approximation to the value of the data item "
-         "that would be preceded by the given fraction of a hypothetical sorted "
+    .def("get_quantile", &quantiles_sketch<T>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
+         "Returns an approximation to the data value "
+         "associated with the given rank in a hypothetical sorted "
          "version of the input stream so far.\n"
-         "Note that this method has a fairly large overhead (microseconds instead of nanoseconds) "
-         "so it should not be called multiple times to get different quantiles from the same "
-         "sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
          "For quantiles_floats_sketch: if the sketch is empty this returns nan. "
          "For quantiles_ints_sketch: if the sketch is empty this throws a RuntimeError.")
     .def("get_quantiles", &dspy::quantiles_sketch_get_quantiles<T>, py::arg("ranks"), py::arg("inclusive")=false,
-         "This is a more efficient multiple-query version of get_quantile().\n"
          "This returns an array that could have been generated by using get_quantile() for each "
-         "fractional rank separately, but would be very inefficient. "
-         "This method incurs the internal set-up overhead once and obtains multiple quantile values in "
-         "a single query. It is strongly recommend that this method be used instead of multiple calls "
-         "to get_quantile().\n"
-         "If the sketch is empty this returns an empty vector.")
-    .def("get_rank", &dspy::quantiles_sketch_get_rank<T>, py::arg("item"), py::arg("inclusive")=false,
-         "Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
+         "normalized rank separately.\n"
+         "If the sketch is empty this returns an empty vector.\n"
+         "Deprecated. Will be removed in the next major version. Use get_quantile() instead.")
+    .def("get_rank", &quantiles_sketch<T>::get_rank, py::arg("value"), py::arg("inclusive")=false,
+         "Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
          "The resulting approximation has a probabilistic guarantee that can be obtained from the "
          "get_normalized_rank_error(False) function.\n"
+         "With the parameter inclusive=true the weight of the given value is included into the rank."
+         "Otherwise the rank equals the sum of the weights of values less than the given value.\n"
          "If the sketch is empty this returns nan.")
     .def("get_pmf", &dspy::quantiles_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
          "Returns an approximation to the Probability Mass Function (PMF) of the input stream "

data/vendor/datasketches-cpp/python/src/req_wrapper.cpp CHANGED Viewed

@@ -51,41 +51,17 @@ double req_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
   return req_sketch<T>::get_normalized_rank_error(k, pmf);
 }
-template<typename T>
-double req_sketch_get_rank(const req_sketch<T>& sk,
-                           const T& item,
-                           bool inclusive) {
-  if (inclusive)
-    return sk.template get_rank<true>(item);
-  else
-    return sk.template get_rank<false>(item);
-}
-template<typename T>
-T req_sketch_get_quantile(const req_sketch<T>& sk,
-                          double rank,
-                          bool inclusive) {
-  if (inclusive)
-    return T(sk.template get_quantile<true>(rank));
-  else
-    return T(sk.template get_quantile<false>(rank));
-}
 template<typename T>
 py::list req_sketch_get_quantiles(const req_sketch<T>& sk,
-                                  std::vector<double>& fractions,
+                                  std::vector<double>& ranks,
                                   bool inclusive) {
-  size_t n_quantiles = fractions.size();
-  auto result = inclusive
-     ? sk.template get_quantiles<true>(&fractions[0], n_quantiles)
-     : sk.template get_quantiles<false>(&fractions[0], n_quantiles);
+  size_t n_quantiles = ranks.size();
+  auto result = sk.get_quantiles(ranks.data(), n_quantiles, inclusive);
   // returning as std::vector<> would copy values to a list anyway
   py::list list(n_quantiles);
   for (size_t i = 0; i < n_quantiles; ++i) {
       list[i] = result[i];
   }
   return list;
 }
@@ -94,15 +70,11 @@ py::list req_sketch_get_pmf(const req_sketch<T>& sk,
                             std::vector<T>& split_points,
                             bool inclusive) {
   size_t n_points = split_points.size();
-  auto result = inclusive
-     ? sk.template get_PMF<true>(&split_points[0], n_points)
-     : sk.template get_PMF<false>(&split_points[0], n_points);
+  auto result = sk.get_PMF(split_points.data(), n_points, inclusive);
   py::list list(n_points + 1);
   for (size_t i = 0; i <= n_points; ++i) {
     list[i] = result[i];
   }
   return list;
 }
@@ -111,15 +83,11 @@ py::list req_sketch_get_cdf(const req_sketch<T>& sk,
                             std::vector<T>& split_points,
                             bool inclusive) {
   size_t n_points = split_points.size();
-  auto result = inclusive
-     ? sk.template get_CDF<true>(&split_points[0], n_points)
-     : sk.template get_CDF<false>(&split_points[0], n_points);
+  auto result = sk.get_CDF(split_points.data(), n_points, inclusive);
   py::list list(n_points + 1);
   for (size_t i = 0; i <= n_points; ++i) {
     list[i] = result[i];
   }
   return list;
 }
@@ -170,33 +138,27 @@ void bind_req_sketch(py::module &m, const char* name) {
          "Returns the number of retained items (samples) in the sketch")
     .def("is_estimation_mode", &req_sketch<T>::is_estimation_mode,
          "Returns True if the sketch is in estimation mode, otherwise False")
-    .def("get_min_value", &req_sketch<T>::get_min_value,
+    .def("get_min_value", &req_sketch<T>::get_min_item,
          "Returns the minimum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
-    .def("get_max_value", &req_sketch<T>::get_max_value,
+    .def("get_max_value", &req_sketch<T>::get_max_item,
          "Returns the maximum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
-    .def("get_quantile", &dspy::req_sketch_get_quantile<T>, py::arg("rank"), py::arg("inclusive")=false,
-         "Returns an approximation to the value of the data item "
-         "that would be preceded by the given fraction of a hypothetical sorted "
+    .def("get_quantile", &req_sketch<T>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
+         "Returns an approximation to the data value "
+         "associated with the given normalized rank in a hypothetical sorted "
          "version of the input stream so far.\n"
-         "Note that this method has a fairly large overhead (microseconds instead of nanoseconds) "
-         "so it should not be called multiple times to get different quantiles from the same "
-         "sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
          "For req_floats_sketch: if the sketch is empty this returns nan. "
          "For req_ints_sketch: if the sketch is empty this throws a RuntimeError.")
     .def("get_quantiles", &dspy::req_sketch_get_quantiles<T>, py::arg("ranks"), py::arg("inclusive")=false,
-         "This is a more efficient multiple-query version of get_quantile().\n"
          "This returns an array that could have been generated by using get_quantile() for each "
-         "fractional rank separately, but would be very inefficient. "
-         "This method incurs the internal set-up overhead once and obtains multiple quantile values in "
-         "a single query. It is strongly recommend that this method be used instead of multiple calls "
-         "to get_quantile().\n"
-         "If the sketch is empty this returns an empty vector.")
-    .def("get_rank", &dspy::req_sketch_get_rank<T>, py::arg("item"), py::arg("inclusive")=false,
-         "Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
+         "normalized rank separately.\n"
+         "If the sketch is empty this returns an empty vector.\n"
+         "Deprecated. Will be removed in the next major version. Use get_quantile() instead.")
+    .def("get_rank", &req_sketch<T>::get_rank, py::arg("value"), py::arg("inclusive")=false,
+         "Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
          "The resulting approximation has a probabilistic guarantee that can be obtained from the "
          "get_normalized_rank_error(False) function.\n"
-         "With the parameter inclusive=true the weight of the given item is included into the rank."
-         "Otherwise the rank equals the sum of the weights of items less than the given item.\n"
+         "With the parameter inclusive=true the weight of the given value is included into the rank."
+         "Otherwise the rank equals the sum of the weights of values less than the given value.\n"
          "If the sketch is empty this returns nan.")
     .def("get_pmf", &dspy::req_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
          "Returns an approximation to the Probability Mass Function (PMF) of the input stream "