RubyGems - datasketches - Versions diffs - 0.2.7 → 0.3.0 - Mend

datasketches 0.2.7 → 0.3.0

Files changed (86) hide show

data/vendor/datasketches-cpp/req/include/req_sketch.hpp CHANGED Viewed

@@ -20,18 +20,17 @@
 #ifndef REQ_SKETCH_HPP_
 #define REQ_SKETCH_HPP_
+#include <iterator>
 #include "req_common.hpp"
 #include "req_compactor.hpp"
-#include "quantile_sketch_sorted_view.hpp"
-#include <stdexcept>
+#include "quantiles_sorted_view.hpp"
 namespace datasketches {
 template<
   typename T,
   typename Comparator = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare)
-  typename S = serde<T>, // deprecated, to be removed in the next major version
   typename Allocator = std::allocator<T>
 >
 class req_sketch {
@@ -40,7 +39,6 @@ public:
   using comparator = Comparator;
   using Compactor = req_compactor<T, Comparator, Allocator>;
   using AllocCompactor = typename std::allocator_traits<Allocator>::template rebind_alloc<Compactor>;
-  using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
   /**
    * Constructor
@@ -48,9 +46,11 @@ public:
    * Value of 12 roughly corresponds to 1% relative error guarantee at 95% confidence.
    * @param hra if true, the default, the high ranks are prioritized for better
    * accuracy. Otherwise the low ranks are prioritized for better accuracy.
+   * @param comparator to use by this instance
    * @param allocator to use by this instance
    */
-  explicit req_sketch(uint16_t k, bool hra = true, const Allocator& allocator = Allocator());
+  explicit req_sketch(uint16_t k, bool hra = true, const Comparator& comparator = Comparator(),
+      const Allocator& allocator = Allocator());
   ~req_sketch();
   req_sketch(const req_sketch& other);
@@ -61,10 +61,12 @@ public:
   /*
    * Type converting constructor.
    * @param other sketch of a different type
+   * @param comparator instance of a Comparator
    * @param allocator instance of an Allocator
    */
-  template<typename TT, typename CC, typename SS, typename AA>
-  explicit req_sketch(const req_sketch<TT, CC, SS, AA>& other, const Allocator& allocator = Allocator());
+  template<typename TT, typename CC, typename AA>
+  explicit req_sketch(const req_sketch<TT, CC, AA>& other, const Comparator& comparator = Comparator(),
+      const Allocator& allocator = Allocator());
   /**
    * Returns configured parameter K
@@ -102,27 +104,33 @@ public:
    */
   bool is_estimation_mode() const;
+  /**
+   * Updates this sketch with the given data item.
+   * @param item from a stream of items
+   */
   template<typename FwdT>
   void update(FwdT&& item);
+  /**
+   * Merges another sketch into this one.
+   * @param other sketch to merge into this one
+   */
   template<typename FwdSk>
   void merge(FwdSk&& other);
   /**
-   * Returns the min value of the stream.
-   * For floating point types: if the sketch is empty this returns NaN.
-   * For other types: if the sketch is empty this throws runtime_error.
-   * @return the min value of the stream
+   * Returns the min item of the stream.
+   * If the sketch is empty this throws std::runtime_error.
+   * @return the min item of the stream
    */
-  const T& get_min_value() const;
+  const T& get_min_item() const;
   /**
-   * Returns the max value of the stream.
-   * For floating point types: if the sketch is empty this returns NaN.
-   * For other types: if the sketch is empty this throws runtime_error.
-   * @return the max value of the stream
+   * Returns the max item of the stream.
+   * If the sketch is empty this throws std::runtime_error.
+   * @return the max item of the stream
    */
-  const T& get_max_value() const;
+  const T& get_max_item() const;
   /**
    * Returns an instance of the comparator for this sketch.
@@ -131,84 +139,99 @@ public:
   Comparator get_comparator() const;
   /**
-   * Returns an approximation to the normalized (fractional) rank of the given item from 0 to 1 inclusive.
-   * With the template parameter inclusive=true the weight of the given item is included into the rank.
-   * Otherwise the rank equals the sum of the weights of items less than the given item according to the Comparator.
+   * Returns an instance of the allocator for this sketch.
+   * @return allocator
+   */
+  Allocator get_allocator() const;
+  /**
+   * Returns an approximation to the normalized rank of the given item from 0 to 1 inclusive.
+   *
+   * <p>If the sketch is empty this throws std::runtime_error.
    *
-   * <p>If the sketch is empty this returns NaN.
+   * @param item to be ranked.
+   * @param inclusive if true the weight of the given item is included into the rank.
+   * Otherwise the rank equals the sum of the weights of all items that are less than the given item
+   * according to the comparator C.
    *
-   * @param item to be ranked
    * @return an approximate rank of the given item
    */
-  template<bool inclusive = false>
-  double get_rank(const T& item) const;
+  double get_rank(const T& item, bool inclusive = true) const;
   /**
    * Returns an approximation to the Probability Mass Function (PMF) of the input stream
-   * given a set of split points (values).
+   * given a set of split points (items).
    *
-   * <p>If the sketch is empty this returns an empty vector.
+   * <p>If the sketch is empty this throws std::runtime_error.
    *
-   * @param split_points an array of <i>m</i> unique, monotonically increasing values
-   * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
-   * If the template parameter inclusive=false, the definition of an "interval" is inclusive of the left split point and exclusive of the right
-   * split point, with the exception that the last interval will include the maximum value.
-   * If the template parameter inclusive=true, the definition of an "interval" is exclusive of the left split point and inclusive of the right
-   * split point.
-   * It is not necessary to include either the min or max values in these split points.
+   * @param split_points an array of <i>m</i> unique, monotonically increasing items
+   * that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
+   *
+   * @param size the number of split points in the array
+   *
+   * @param inclusive if true the rank of an item includes its own weight, and therefore
+   * if the sketch contains items equal to a split point, then in PMF such items are
+   * included into the interval to the left of the split point. Otherwise they are included into the interval
+   * to the right of the split point.
    *
    * @return an array of m+1 doubles each of which is an approximation
-   * to the fraction of the input stream values (the mass) that fall into one of those intervals.
-   * If the template parameter inclusive=false, the definition of an "interval" is inclusive of the left split point and exclusive of the right
-   * split point, with the exception that the last interval will include the maximum value.
-   * If the template parameter inclusive=true, the definition of an "interval" is exclusive of the left split point and inclusive of the right
-   * split point.
+   * to the fraction of the input stream items (the mass) that fall into one of those intervals.
    */
-  template<bool inclusive = false>
-  vector_double get_PMF(const T* split_points, uint32_t size) const;
+  using vector_double = typename quantiles_sorted_view<T, Comparator, Allocator>::vector_double;
+  vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
   /**
    * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
-   * cumulative analog of the PMF, of the input stream given a set of split points (values).
+   * cumulative analog of the PMF, of the input stream given a set of split points (items).
    *
-   * <p>If the sketch is empty this returns an empty vector.
+   * <p>If the sketch is empty this throws std::runtime_error.
    *
-   * @param split_points an array of <i>m</i> unique, monotonically increasing float values
+   * @param split_points an array of <i>m</i> unique, monotonically increasing items
    * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
-   * If the template parameter inclusive=false, the definition of an "interval" is inclusive of the left split point and exclusive of the right
-   * split point, with the exception that the last interval will include the maximum value.
-   * If the template parameter inclusive=true, the definition of an "interval" is exclusive of the left split point and inclusive of the right
-   * split point.
-   * It is not necessary to include either the min or max values in these split points.
    *
-   * @return an array of m+1 double values, which are a consecutive approximation to the CDF
+   * @param size the number of split points in the array
+   *
+   * @param inclusive if true the rank of an item includes its own weight, and therefore
+   * if the sketch contains items equal to a split point, then in CDF such items are
+   * included into the interval to the left of the split point. Otherwise they are included into
+   * the interval to the right of the split point.
+   *
+   * @return an array of m+1 doubles, which are a consecutive approximation to the CDF
    * of the input stream given the split_points. The value at array position j of the returned
    * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
-   * array.
+   * array. This can be viewed as an array of ranks of the given split points plus one more value
+   * that is always 1.
    */
-  template<bool inclusive = false>
-  vector_double get_CDF(const T* split_points, uint32_t size) const;
+  vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;
   /**
    * Returns an approximate quantile of the given normalized rank.
    * The normalized rank must be in the range [0.0, 1.0] (both inclusive).
-   * @param rank the given normalized rank
-   * @return approximate quantile given the normalized rank
+   * <p>If the sketch is empty this throws std::runtime_error.
+   *
+   * @param rank of an item in the hypothetical sorted stream.
+   * @param inclusive if true, the given rank is considered inclusive (includes weight of an item)
+   *
+   * @return approximate quantile associated with the given rank
    */
-  using quantile_return_type = typename quantile_sketch_sorted_view<T, Comparator, Allocator>::quantile_return_type;
-  template<bool inclusive = false>
-  quantile_return_type get_quantile(double rank) const;
+  using quantile_return_type = typename quantiles_sorted_view<T, Comparator, Allocator>::quantile_return_type;
+  quantile_return_type get_quantile(double rank, bool inclusive = true) const;
   /**
    * Returns an array of quantiles that correspond to the given array of normalized ranks.
+   * <p>If the sketch is empty this throws std::runtime_error.
+   *
    * @param ranks given array of normalized ranks.
+   * @param size the number of ranks in the array.
+   *
    * @return array of quantiles that correspond to the given array of normalized ranks
+   *
+   * Deprecated. Will be removed in the next major version. Use get_quantile() instead.
    */
-  template<bool inclusive = false>
-  std::vector<T, Allocator> get_quantiles(const double* ranks, uint32_t size) const;
+  std::vector<T, Allocator> get_quantiles(const double* ranks, uint32_t size, bool inclusive = true) const;
   /**
-   * Returns an approximate lower bound of the given noramalized rank.
+   * Returns an approximate lower bound of the given normalized rank.
    * @param rank the given rank, a value between 0 and 1.0.
    * @param num_std_dev the number of standard deviations. Must be 1, 2, or 3.
    * @return an approximate lower bound rank.
@@ -216,7 +239,7 @@ public:
   double get_rank_lower_bound(double rank, uint8_t num_std_dev) const;
   /**
-   * Returns an approximate upper bound of the given noramalized rank.
+   * Returns an approximate upper bound of the given normalized rank.
    * @param rank the given rank, a value between 0 and 1.0.
    * @param num_std_dev the number of standard deviations. Must be 1, 2, or 3.
    * @return an approximate upper bound rank.
@@ -239,27 +262,27 @@ public:
   /**
    * Computes size needed to serialize the current state of the sketch.
    * This version is for fixed-size arithmetic types (integral and floating point).
-   * @param instance of a SerDe
+   * @param sd instance of a SerDe
    * @return size in bytes needed to serialize this sketch
    */
-  template<typename TT = T, typename SerDe = S, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
+  template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
   size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
   /**
    * Computes size needed to serialize the current state of the sketch.
    * This version is for all other types and can be expensive since every item needs to be looked at.
-   * @param instance of a SerDe
+   * @param sd instance of a SerDe
    * @return size in bytes needed to serialize this sketch
    */
-  template<typename TT = T, typename SerDe = S, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
+  template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
   size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
   /**
    * This method serializes the sketch into a given stream in a binary form
    * @param os output stream
-   * @param instance of a SerDe
+   * @param sd instance of a SerDe
    */
-  template<typename SerDe = S>
+  template<typename SerDe = serde<T>>
   void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
   // This is a convenience alias for users
@@ -272,52 +295,35 @@ public:
    * It is a blank space of a given size.
    * This header is used in Datasketches PostgreSQL extension.
    * @param header_size_bytes space to reserve in front of the sketch
-   * @param instance of a SerDe
+   * @param sd instance of a SerDe
    */
-  template<typename SerDe = S>
+  template<typename SerDe = serde<T>>
   vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
   /**
    * This method deserializes a sketch from a given stream.
    * @param is input stream
-   * @param instance of an Allocator
-   * @return an instance of a sketch
-   *
-   * Deprecated, to be removed in the next major version
-   */
-  static req_sketch deserialize(std::istream& is, const Allocator& allocator = Allocator());
-  /**
-   * This method deserializes a sketch from a given stream.
-   * @param is input stream
-   * @param instance of a SerDe
-   * @param instance of an Allocator
-   * @return an instance of a sketch
-   */
-  template<typename SerDe = S>
-  static req_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());
-  /**
-   * This method deserializes a sketch from a given array of bytes.
-   * @param bytes pointer to the array of bytes
-   * @param size the size of the array
-   * @param instance of an Allocator
+   * @param sd instance of a SerDe
+   * @param comparator instance of a Comparator
+   * @param allocator instance of an Allocator
    * @return an instance of a sketch
-   *
-   * Deprecated, to be removed in the next major version
    */
-  static req_sketch deserialize(const void* bytes, size_t size, const Allocator& allocator = Allocator());
+  template<typename SerDe = serde<T>>
+  static req_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(),
+      const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
   /**
    * This method deserializes a sketch from a given array of bytes.
    * @param bytes pointer to the array of bytes
    * @param size the size of the array
-   * @param instance of a SerDe
-   * @param instance of an Allocator
+   * @param sd instance of a SerDe
+   * @param comparator instance of a Comparator
+   * @param allocator instance of an Allocator
    * @return an instance of a sketch
    */
-  template<typename SerDe = S>
-  static req_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());
+  template<typename SerDe = serde<T>>
+  static req_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(),
+      const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
   /**
    * Prints a summary of the sketch.
@@ -330,10 +336,10 @@ public:
   const_iterator begin() const;
   const_iterator end() const;
-  template<bool inclusive = false>
-  quantile_sketch_sorted_view<T, Comparator, Allocator> get_sorted_view(bool cumulative) const;
+  quantiles_sorted_view<T, Comparator, Allocator> get_sorted_view() const;
 private:
+  Comparator comparator_;
   Allocator allocator_;
   uint16_t k_;
   bool hra_;
@@ -341,8 +347,12 @@ private:
   uint32_t num_retained_;
   uint64_t n_;
   std::vector<Compactor, AllocCompactor> compactors_;
-  T* min_value_;
-  T* max_value_;
+  T* min_item_;
+  T* max_item_;
+  mutable quantiles_sorted_view<T, Comparator, Allocator>* sorted_view_;
+  void setup_sorted_view() const; // modifies mutable state
+  void reset_sorted_view();
   static const bool LAZY_COMPRESSION = false;
@@ -366,75 +376,44 @@ private:
   // for deserialization
   class item_deleter;
-  req_sketch(uint16_t k, bool hra, uint64_t n, std::unique_ptr<T, item_deleter> min_value, std::unique_ptr<T, item_deleter> max_value, std::vector<Compactor, AllocCompactor>&& compactors);
+  req_sketch(uint16_t k, bool hra, uint64_t n,
+      std::unique_ptr<T, item_deleter> min_item, std::unique_ptr<T, item_deleter> max_item,
+      std::vector<Compactor, AllocCompactor>&& compactors, const Comparator& comparator);
   static void check_preamble_ints(uint8_t preamble_ints, uint8_t num_levels);
   static void check_serial_version(uint8_t serial_version);
   static void check_family_id(uint8_t family_id);
-  // implementations for floating point types
   template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
-  static const TT& get_invalid_value() {
-    static TT value = std::numeric_limits<TT>::quiet_NaN();
-    return value;
+  static inline bool check_update_item(const TT& item) {
+    return !std::isnan(item);
   }
-  template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
-  static inline bool check_update_value(const TT& value) {
-    return !std::isnan(value);
-  }
-  template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
-  static inline void check_split_points(const T* values, uint32_t size) {
-    for (uint32_t i = 0; i < size ; i++) {
-      if (std::isnan(values[i])) {
-        throw std::invalid_argument("Values must not be NaN");
-      }
-      if ((i < (size - 1)) && !(Comparator()(values[i], values[i + 1]))) {
-        throw std::invalid_argument("Values must be unique and monotonically increasing");
-      }
-    }
-  }
-  // implementations for all other types
   template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
-  static const TT& get_invalid_value() {
-    throw std::runtime_error("getting quantiles from empty sketch is not supported for this type of values");
-  }
-  template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
-  static inline bool check_update_value(const TT&) {
+  static inline bool check_update_item(const TT&) {
     return true;
   }
-  template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
-  static inline void check_split_points(const T* values, uint32_t size) {
-    for (uint32_t i = 0; i < size ; i++) {
-      if ((i < (size - 1)) && !(Comparator()(values[i], values[i + 1]))) {
-        throw std::invalid_argument("Values must be unique and monotonically increasing");
-      }
-    }
-  }
   // for type converting constructor
-  template<typename TT, typename CC, typename SS, typename AA>
-  friend class req_sketch;
+  template<typename TT, typename CC, typename AA> friend class req_sketch;
 };
-template<typename T, typename C, typename S, typename A>
-class req_sketch<T, C, S, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> {
+template<typename T, typename C, typename A>
+class req_sketch<T, C, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> {
 public:
+  using value_type = std::pair<const T&, const uint64_t>;
   const_iterator& operator++();
   const_iterator& operator++(int);
   bool operator==(const const_iterator& other) const;
   bool operator!=(const const_iterator& other) const;
-  std::pair<const T&, const uint64_t> operator*() const;
+  const value_type operator*() const;
+  const return_value_holder<value_type> operator->() const;
 private:
   using LevelsIterator = typename std::vector<Compactor, AllocCompactor>::const_iterator;
   LevelsIterator levels_it_;
   LevelsIterator levels_end_;
   const T* compactor_it_;
-  friend class req_sketch<T, C, S, A>;
+  friend class req_sketch<T, C, A>;
   const_iterator(LevelsIterator begin, LevelsIterator end);
 };