RubyGems - datasketches - Versions diffs - 0.4.3 → 0.5.0 - Mend

datasketches 0.4.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp ADDED Viewed

@@ -0,0 +1,406 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <catch2/catch.hpp>
+#include "bloom_filter.hpp"
+// Path prefix held in a file-local string: the TEST_BINARY_INPUT_PATH macro when it is
+// defined, otherwise the relative directory "test/".
+// NOTE(review): presumably the location of binary test input files (inferred from the
+// name); it is not referenced in the visible part of this file -- confirm it is still needed.
+#ifdef TEST_BINARY_INPUT_PATH
+static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
+#else
+static std::string testBinaryInputPath = "test/";
+#endif
+namespace datasketches {
+// Verifies that the size-based builder rejects invalid arguments by throwing
+// std::invalid_argument.
+TEST_CASE("bloom_filter: invalid constructor args", "[bloom_filter]") {
+  // zero bits
+  REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(0, 4), std::invalid_argument);
+  // 2^60 bits: the literal must be unsigned long long because long is only 32 bits wide
+  // on LLP64 platforms, where 1L << 60 is undefined and would not exercise this case
+  REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(1ULL << 60, 4), std::invalid_argument);
+  // zero hash functions
+  REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(65535, 0), std::invalid_argument);
+}
+// Builds a filter four ways (create_by_size, create_by_accuracy, initialize_by_size,
+// initialize_by_accuracy) from the same inputs and checks that each one reports the same
+// capacity, number of hashes and seed, and starts out empty.
+TEST_CASE("bloom_filter: standard constructors", "[bloom_filter]") {
+  uint64_t num_items = 4000;
+  double fpp = 0.01;
+  // derive the size parameters from the target item count and false positive probability
+  uint64_t num_bits = bloom_filter::builder::suggest_num_filter_bits(num_items, fpp);
+  uint16_t num_hashes = bloom_filter::builder::suggest_num_hashes(num_items, num_bits);
+  uint64_t seed = 89023;
+  auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes, seed);
+  uint64_t adjusted_num_bits = (num_bits + 63) & ~0x3F; // round up to the nearest multiple of 64
+  REQUIRE(bf.get_capacity() == adjusted_num_bits);
+  REQUIRE(bf.get_num_hashes() == num_hashes);
+  REQUIRE(bf.get_seed() == seed);
+  REQUIRE(bf.is_empty());
+  // should match above
+  bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
+  REQUIRE(bf.get_capacity() == adjusted_num_bits);
+  REQUIRE(bf.get_num_hashes() == num_hashes);
+  REQUIRE(bf.get_seed() == seed);
+  REQUIRE(bf.is_empty());
+  // same for initializing memory in-place
+  // the caller-provided buffer is sized with get_serialized_size_bytes(num_bits)
+  size_t serialized_size_bytes = bloom_filter::get_serialized_size_bytes(num_bits);
+  uint8_t* bytes = new uint8_t[serialized_size_bytes];
+  bf = bloom_filter::builder::initialize_by_size(bytes, serialized_size_bytes, num_bits, num_hashes, seed);
+  REQUIRE(bf.get_capacity() == adjusted_num_bits);
+  REQUIRE(bf.get_num_hashes() == num_hashes);
+  REQUIRE(bf.get_seed() == seed);
+  REQUIRE(bf.is_empty());
+  // the same buffer is initialized a second time, now from the accuracy parameters
+  bf = bloom_filter::builder::initialize_by_accuracy(bytes, serialized_size_bytes, num_items, fpp, seed);
+  REQUIRE(bf.get_capacity() == adjusted_num_bits);
+  REQUIRE(bf.get_num_hashes() == num_hashes);
+  REQUIRE(bf.get_seed() == seed);
+  REQUIRE(bf.is_empty());
+  // NOTE(review): the buffer is released only here, so it would presumably leak if an
+  // assertion above aborts the test case -- confirm whether that matters for this suite
+  delete [] bytes;
+}
+// Exercises update/query on a filter created by accuracy, then repeats the same steps on a
+// second filter initialized in caller-provided memory and checks that both produce identical
+// bit counts, query results and serialized bytes. Finishes by resetting both filters.
+TEST_CASE("bloom_filter: basic operations", "[bloom_filter]") {
+  uint64_t num_items = 5000;
+  double fpp = 0.01;
+  uint64_t seed = 4897301548054ULL;
+  auto bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
+  REQUIRE(bf.is_empty());
+  REQUIRE(bf.get_bits_used() == 0);
+  // insert the items 0 .. num_items - 1
+  for (uint64_t i = 0; i < num_items; ++i) {
+    bf.query_and_update(i);
+  }
+  REQUIRE(!bf.is_empty());
+  // filter is about 50% full at target capacity
+  // since seed is fixed we expect an exact value every time
+  // but leaving the approximate test in since that's more the "expectation"
+  REQUIRE(bf.get_bits_used() == 24793); // exact value is not important but should be consistent
+  REQUIRE(bf.get_bits_used() == Approx(0.5 * bf.get_capacity()).epsilon(0.05)); // just over 3.3% in practice
+  // count hits among items that were never inserted
+  uint32_t num_found = 0;
+  for (uint64_t i = num_items; i < bf.get_capacity(); ++i) {
+    if (bf.query(i)) {
+      ++num_found;
+    }
+  }
+  // fpp is average with significant variance -- even at 12% it would fail occasionally
+  REQUIRE(num_found == 423);
+  //REQUIRE(num_found == Approx((bf.get_capacity() - num_items) * fpp).epsilon(0.12));
+  auto bytes = bf.serialize();
+  // initialize in memory and run the same tests
+  // also checking against the results from the first part
+  // the buffer is sized from the serialized image of the first filter
+  uint8_t* bf_memory = new uint8_t[bytes.size()];
+  auto bf2 = bloom_filter::builder::initialize_by_accuracy(bf_memory, bytes.size(), num_items, fpp, bf.get_seed());
+  REQUIRE(bf2.is_empty());
+  REQUIRE(bf2.get_bits_used() == 0);
+  for (uint64_t i = 0; i < num_items; ++i) {
+    bf2.query_and_update(i);
+  }
+  REQUIRE(!bf2.is_empty());
+  REQUIRE(bf2.get_bits_used() == bf.get_bits_used()); // should exactly match above
+  uint32_t num_found2 = 0;
+  for (uint64_t i = num_items; i < bf2.get_capacity(); ++i) {
+    if (bf2.query(i)) {
+      ++num_found2;
+    }
+  }
+  REQUIRE(num_found == num_found2); // should exactly match above
+  // both filters must serialize to the same bytes
+  auto bytes2 = bf2.serialize();
+  REQUIRE(bytes.size() == bytes2.size());
+  for (size_t i = 0; i < bytes.size(); ++i) {
+    REQUIRE(bytes[i] == bytes2[i]);
+  }
+  // check that raw memory also matches serialized sketch
+  const uint8_t* bf_bytes = bf2.get_wrapped_memory();
+  REQUIRE(bf_bytes == bf_memory);
+  for (size_t i = 0; i < bytes.size(); ++i) {
+    REQUIRE(bf_bytes[i] == bytes[i]);
+  }
+  // ensure the filters reset properly
+  bf.reset();
+  REQUIRE(bf.is_empty());
+  REQUIRE(bf.get_bits_used() == 0);
+  bf2.reset();
+  REQUIRE(bf2.is_empty());
+  REQUIRE(bf2.get_bits_used() == 0);
+  // NOTE(review): released only on the success path -- presumably leaks if an assertion
+  // above aborts the test case
+  delete [] bf_memory;
+}
+// After invert() the number of used bits must be the complement of the count before the
+// call; previously inserted items should mostly read as absent and most other items as present.
+TEST_CASE("bloom_filter: inversion", "[bloom_filter]") {
+  uint64_t total_bits = 8192;
+  uint16_t hash_count = 3;
+  uint64_t inserted = 500;
+  auto filter = bloom_filter::builder::create_by_size(total_bits, hash_count);
+  for (uint64_t item = 0; item < inserted; ++item) {
+    filter.update(item);
+  }
+  uint64_t bits_before = filter.get_bits_used();
+  filter.invert();
+  REQUIRE(filter.get_bits_used() == total_bits - bits_before);
+  // hits among the items that were inserted before the inversion
+  uint32_t old_hits = 0;
+  for (uint64_t item = 0; item < inserted; ++item) {
+    old_hits += filter.query(item) ? 1 : 0;
+  }
+  REQUIRE(old_hits < inserted / 10);
+  // hits among items that were never inserted
+  uint32_t new_hits = 0;
+  for (uint64_t item = inserted; item < total_bits; ++item) {
+    new_hits += filter.query(item) ? 1 : 0;
+  }
+  REQUIRE(new_hits > inserted);
+}
+// Set operations between filters that differ in size, number of hashes or seed must be
+// rejected with std::invalid_argument. Each case differs from bf1 in exactly one property;
+// bf1's seed is passed explicitly so the result does not depend on how a default seed is chosen.
+TEST_CASE("bloom_filter: incompatible set operations", "[bloom_filter]") {
+  uint64_t num_bits = 32768;
+  uint16_t num_hashes = 4;
+  auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
+  // mismatched num bits
+  auto bf2 = bloom_filter::builder::create_by_size(2 * num_bits, num_hashes, bf1.get_seed());
+  REQUIRE_THROWS_AS(bf1.union_with(bf2), std::invalid_argument);
+  // mismatched num hashes (this must use bf3, not bf2, or the hash-count check is never exercised)
+  auto bf3 = bloom_filter::builder::create_by_size(num_bits, 2 * num_hashes, bf1.get_seed());
+  REQUIRE_THROWS_AS(bf1.intersect(bf3), std::invalid_argument);
+  // mismatched seed
+  auto bf4 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed() + 1);
+  REQUIRE_THROWS_AS(bf1.union_with(bf4), std::invalid_argument);
+}
+// Unions a filter holding items [0, n) with one holding [n/2, 3n/2) and checks that every
+// inserted item is reported as present, while the remaining items up to num_bits produce
+// only a modest number of false positives.
+TEST_CASE("bloom_filter: basic union", "[bloom_filter]") {
+  const uint64_t num_bits = 12288;
+  const uint16_t num_hashes = 4;
+  auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
+  auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed());
+  const uint64_t n = 1000;
+  const uint32_t max_item = 3 * n / 2 - 1; // largest item inserted (into bf2)
+  for (uint64_t i = 0; i < n; ++i) {
+    bf1.query_and_update(i);
+    bf2.update(n / 2 + i);
+  }
+  bf1.union_with(bf2);
+  // max_item itself was inserted, so the range is inclusive
+  for (uint64_t i = 0; i <= max_item; ++i) {
+    REQUIRE(bf1.query(i));
+  }
+  // only items that were never inserted can be false positives
+  uint32_t num_found = 0;
+  for (uint64_t i = max_item + 1; i < num_bits; ++i) {
+    if (bf1.query(i)) {
+      ++num_found;
+    }
+  }
+  REQUIRE(num_found < num_bits / 10); // not being super strict
+}
+// Intersects a filter holding items [0, n) with one holding [n/2, 3n/2). Every item in the
+// shared range [n/2, n) must still be present; hits among [0, n/2) and among the items from
+// last_item up to bit_count are tallied and must stay below a loose bound.
+TEST_CASE("bloom_filter: basic intersection", "[bloom_filter]") {
+  const uint64_t bit_count = 8192;
+  const uint16_t hash_count = 5;
+  auto lhs = bloom_filter::builder::create_by_size(bit_count, hash_count);
+  auto rhs = bloom_filter::builder::create_by_size(bit_count, hash_count, lhs.get_seed());
+  const uint64_t n = 1024;
+  const uint32_t last_item = 3 * n / 2 - 1;
+  for (uint64_t k = 0; k < n; ++k) {
+    lhs.update(k);
+    rhs.update(n / 2 + k);
+  }
+  lhs.intersect(rhs);
+  // every item in the overlapping range must still be reported as present
+  for (uint64_t k = n / 2; k < n; ++k) {
+    REQUIRE(lhs.query(k));
+  }
+  // tally the hits outside the overlapping range
+  uint32_t stray_hits = 0;
+  for (uint64_t k = 0; k < n / 2; ++k) {
+    stray_hits += lhs.query(k) ? 1 : 0;
+  }
+  for (uint64_t k = last_item; k < bit_count; ++k) {
+    stray_hits += lhs.query(k) ? 1 : 0;
+  }
+  REQUIRE(stray_hits < bit_count / 10); // not being super strict
+}
+// Serializes an empty filter and restores it from bytes, from a stream and through a
+// read-only wrap; each result must report the same capacity, seed and number of hashes and
+// be empty. A writable wrap of the same bytes is expected to throw std::invalid_argument.
+TEST_CASE("bloom_filter: empty serialization", "[bloom_filter]") {
+  const uint64_t bit_count = 32769;
+  const uint16_t hash_count = 7;
+  auto original = bloom_filter::builder::create_by_size(bit_count, hash_count);
+  auto image = original.serialize();
+  REQUIRE(image.size() == original.get_serialized_size_bytes());
+  // restored from the byte image
+  auto from_bytes = bloom_filter::deserialize(image.data(), image.size());
+  REQUIRE(from_bytes.get_capacity() == original.get_capacity());
+  REQUIRE(from_bytes.get_seed() == original.get_seed());
+  REQUIRE(from_bytes.get_num_hashes() == original.get_num_hashes());
+  REQUIRE(from_bytes.is_empty());
+  // restored from a binary stream
+  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
+  original.serialize(stream);
+  auto from_stream = bloom_filter::deserialize(stream);
+  REQUIRE(from_stream.get_capacity() == original.get_capacity());
+  REQUIRE(from_stream.get_seed() == original.get_seed());
+  REQUIRE(from_stream.get_num_hashes() == original.get_num_hashes());
+  REQUIRE(from_stream.is_empty());
+  // wrapping the image read-only is allowed
+  auto wrapped = bloom_filter::wrap(image.data(), image.size());
+  REQUIRE(wrapped.get_capacity() == original.get_capacity());
+  REQUIRE(wrapped.get_seed() == original.get_seed());
+  REQUIRE(wrapped.get_num_hashes() == original.get_num_hashes());
+  REQUIRE(wrapped.is_empty());
+  // wrapping the same image as writable is expected to be rejected
+  REQUIRE_THROWS_AS(bloom_filter::writable_wrap(image.data(), image.size()), std::invalid_argument);
+}
+// Fills a filter with floating point items, records how many never-inserted items it reports
+// as present, and then checks that a filter restored from bytes, from a stream, through a
+// read-only wrap and through a writable wrap all reproduce the same configuration and the
+// same false positive count. Also checks memory ownership for each variant, that a read-only
+// wrap rejects mutation with std::logic_error, and that a writable wrap accepts updates.
+TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") {
+  const uint64_t num_bits = 32768;
+  const uint16_t num_hashes = 5;
+  auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
+  const uint64_t n = 1000;
+  for (uint64_t i = 0; i < n; ++i) {
+    bf.update(0.5 + i); // testing floats
+  }
+  // test more items without updating, assuming some false positives
+  // so we can check that we get the same number of false positives
+  // with the same query items
+  uint64_t fp_count = 0;
+  for (uint64_t i = n; i < num_bits; ++i) {
+    fp_count += bf.query(0.5 + i) ? 1 : 0;
+  }
+  auto bytes = bf.serialize();
+  REQUIRE(bytes.size() == bf.get_serialized_size_bytes());
+  // deserialize from bytes
+  auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size());
+  REQUIRE(bf.get_capacity() == bf_bytes.get_capacity());
+  REQUIRE(bf.get_seed() == bf_bytes.get_seed());
+  REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes());
+  REQUIRE(!bf_bytes.is_empty());
+  REQUIRE(bf.is_memory_owned());
+  // the deserialized filter is the one under test here, mirroring the stream case below
+  REQUIRE(bf_bytes.is_memory_owned());
+  uint64_t fp_count_bytes = 0;
+  for (uint64_t i = 0; i < num_bits; ++i) {
+    bool val = bf_bytes.query(0.5 + i);
+    if (i < n)
+      REQUIRE(val);
+    else if (val)
+      ++fp_count_bytes;
+  }
+  REQUIRE(fp_count_bytes == fp_count);
+  // deserialize from a binary stream
+  std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
+  bf.serialize(ss);
+  auto bf_stream = bloom_filter::deserialize(ss);
+  REQUIRE(bf.get_capacity() == bf_stream.get_capacity());
+  REQUIRE(bf.get_seed() == bf_stream.get_seed());
+  REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes());
+  REQUIRE(!bf_stream.is_empty());
+  REQUIRE(bf_stream.is_memory_owned());
+  uint64_t fp_count_stream = 0;
+  for (uint64_t i = 0; i < num_bits; ++i) {
+    bool val = bf_stream.query(0.5 + i);
+    if (i < n)
+      REQUIRE(val);
+    else if (val)
+      ++fp_count_stream;
+  }
+  REQUIRE(fp_count_stream == fp_count);
+  // read-only wrap
+  auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size());
+  REQUIRE(bf.get_capacity() == bf_wrap.get_capacity());
+  REQUIRE(bf.get_seed() == bf_wrap.get_seed());
+  REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes());
+  REQUIRE(!bf_wrap.is_empty());
+  REQUIRE(!bf_wrap.is_memory_owned());
+  uint64_t fp_count_wrap = 0;
+  for (uint64_t i = 0; i < num_bits; ++i) {
+    bool val = bf_wrap.query(0.5 + i);
+    if (i < n)
+      REQUIRE(val);
+    else if (val)
+      ++fp_count_wrap;
+  }
+  REQUIRE(fp_count_wrap == fp_count);
+  // mutating operations on a read-only wrap must be rejected
+  REQUIRE_THROWS_AS(bf_wrap.update(-1.0), std::logic_error);
+  REQUIRE_THROWS_AS(bf_wrap.query_and_update(-2.0), std::logic_error);
+  REQUIRE_THROWS_AS(bf_wrap.reset(), std::logic_error);
+  // writable wrap
+  auto bf_writable = bloom_filter::writable_wrap(bytes.data(), bytes.size());
+  REQUIRE(bf.get_capacity() == bf_writable.get_capacity());
+  REQUIRE(bf.get_seed() == bf_writable.get_seed());
+  REQUIRE(bf.get_num_hashes() == bf_writable.get_num_hashes());
+  REQUIRE(!bf_writable.is_empty());
+  REQUIRE(!bf_writable.is_memory_owned());
+  uint64_t fp_count_writable = 0;
+  for (uint64_t i = 0; i < num_bits; ++i) {
+    bool val = bf_writable.query(0.5 + i);
+    if (i < n)
+      REQUIRE(val);
+    else if (val)
+      ++fp_count_writable;
+  }
+  REQUIRE(fp_count_writable == fp_count);
+  REQUIRE(!bf_writable.query(-1.0));
+  bf_writable.update(-1.0);
+  REQUIRE(bf_writable.query(-1.0));
+  // not good memory management to do this, but because we wrapped the same bytes as both
+  // read-only and writable, that update should have changed the read-only version, too
+  REQUIRE(bf_wrap.query(-1.0));
+}
+} // namespace datasketches

data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp CHANGED Viewed

@@ -89,6 +89,7 @@ public:
   using vector_t = std::vector<T, Allocator>;
   using vector_centroid = std::vector<centroid, typename std::allocator_traits<Allocator>::template rebind_alloc<centroid>>;
   using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
+  using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
   struct centroid_cmp {
     centroid_cmp() {}
@@ -115,7 +116,7 @@ public:
    * Merge the given t-Digest into this one
    * @param other t-Digest to merge
    */
-  void merge(tdigest& other);
+  void merge(const tdigest& other);
   /**
    * Process buffered values and merge centroids if needed
@@ -142,8 +143,17 @@ public:
    */
   uint64_t get_total_weight() const;
+  /**
+   * Returns an instance of the allocator for this t-Digest.
+   * @return allocator
+   */
+  Allocator get_allocator() const;
   /**
    * Compute approximate normalized rank of the given value.
+   *
+   * <p>If the sketch is empty this throws std::runtime_error.
+   *
    * @param value to be ranked
    * @return normalized rank (from 0 to 1 inclusive)
    */
@@ -151,11 +161,49 @@ public:
   /**
    * Compute approximate quantile value corresponding to the given normalized rank
+   *
+   * <p>If the sketch is empty this throws std::runtime_error.
+   *
    * @param rank normalized rank (from 0 to 1 inclusive)
    * @return quantile value corresponding to the given rank
    */
   T get_quantile(double rank) const;
+  /**
+   * Returns an approximation to the Probability Mass Function (PMF) of the input stream
+   * given a set of split points.
+   *
+   * <p>If the sketch is empty this throws std::runtime_error.
+   *
+   * @param split_points an array of <i>m</i> unique, monotonically increasing values
+   * that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
+   *
+   * @param size the number of split points in the array
+   *
+   * @return an array of m+1 doubles each of which is an approximation
+   * to the fraction of the input stream values (the mass) that fall into one of those intervals.
+   */
+  vector_double get_PMF(const T* split_points, uint32_t size) const;
+  /**
+   * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
+   * cumulative analog of the PMF, of the input stream given a set of split points.
+   *
+   * <p>If the sketch is empty this throws std::runtime_error.
+   *
+   * @param split_points an array of <i>m</i> unique, monotonically increasing values
+   * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
+   *
+   * @param size the number of split points in the array
+   *
+   * @return an array of m+1 doubles, which are a consecutive approximation to the CDF
+   * of the input stream given the split_points. The value at array position j of the returned
+   * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
+   * array. This can be viewed as an array of the ranks of the given split points plus one more value
+   * that is always 1.
+   */
+  vector_double get_CDF(const T* split_points, uint32_t size) const;
   /**
    * @return parameter k (compression) that was used to configure this t-Digest
    */
@@ -245,6 +293,8 @@ private:
   // for compatibility with format of the reference implementation
   static tdigest deserialize_compat(std::istream& is, const Allocator& allocator = Allocator());
   static tdigest deserialize_compat(const void* bytes, size_t size, const Allocator& allocator = Allocator());
+  static inline void check_split_points(const T* values, uint32_t size);
 };
 } /* namespace datasketches */

data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp CHANGED Viewed

@@ -20,6 +20,7 @@
 #ifndef _TDIGEST_IMPL_HPP_
 #define _TDIGEST_IMPL_HPP_
+#include <algorithm>
 #include <cmath>
 #include <sstream>
@@ -43,7 +44,7 @@ void tdigest<T, A>::update(T value) {
 }
 template<typename T, typename A>
-void tdigest<T, A>::merge(tdigest& other) {
+void tdigest<T, A>::merge(const tdigest& other) {
   if (other.is_empty()) return;
   vector_centroid tmp(buffer_.get_allocator());
   tmp.reserve(buffer_.size() + centroids_.size() + other.buffer_.size() + other.centroids_.size());
@@ -84,6 +85,11 @@ uint64_t tdigest<T, A>::get_total_weight() const {
   return centroids_weight_ + buffer_.size();
 }
+// Returns the allocator obtained from the internal buffer_ container, as type A.
+template<typename T, typename A>
+A tdigest<T, A>::get_allocator() const {
+  return buffer_.get_allocator();
+}
 template<typename T, typename A>
 double tdigest<T, A>::get_rank(T value) const {
   if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
@@ -190,6 +196,25 @@ T tdigest<T, A>::get_quantile(double rank) const {
   return weighted_average(centroids_.back().get_weight(), w1, max_, w2);
 }
+// Computes the PMF by differencing the CDF in place: get_CDF() yields size + 1 cumulative
+// values, and each bucket mass is its cumulative value minus the preceding one.
+template<typename T, typename A>
+auto tdigest<T, A>::get_PMF(const T* split_points, uint32_t size) const -> vector_double {
+  auto masses = get_CDF(split_points, size);
+  // walk from the last bucket down so that every subtraction still sees a cumulative predecessor
+  uint32_t idx = size;
+  while (idx > 0) {
+    masses[idx] -= masses[idx - 1];
+    --idx;
+  }
+  return masses;
+}
+// Computes the CDF at the given split points: the rank of each split point followed by a
+// final value of 1, so the result has size + 1 entries.
+// Throws std::invalid_argument (from check_split_points) for NaN or non-increasing split
+// points, and std::runtime_error if the sketch is empty.
+template<typename T, typename A>
+auto tdigest<T, A>::get_CDF(const T* split_points, uint32_t size) const -> vector_double {
+  check_split_points(split_points, size);
+  // get_rank() also rejects an empty sketch, but it is never called when size == 0;
+  // check explicitly so that the documented exception is thrown in that case as well
+  if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
+  vector_double ranks(get_allocator());
+  ranks.reserve(size + 1);
+  for (uint32_t i = 0; i < size; ++i) ranks.push_back(get_rank(split_points[i]));
+  ranks.push_back(1);
+  return ranks;
+}
 template<typename T, typename A>
 uint16_t tdigest<T, A>::get_k() const {
   return k_;
@@ -590,6 +615,18 @@ buffer_(std::move(buffer))
   buffer_.reserve(centroids_capacity_ * BUFFER_MULTIPLIER);
 }
+// Validates an array of split points: throws std::invalid_argument if any value is NaN or
+// if a value is not strictly less than its successor.
+template<typename T, typename A>
+void tdigest<T, A>::check_split_points(const T* values, uint32_t size) {
+  for (uint32_t idx = 0; idx < size; ++idx) {
+    if (std::isnan(values[idx])) {
+      throw std::invalid_argument("Values must not be NaN");
+    }
+    // the last element has no successor to compare against
+    const bool has_next = idx + 1 < size;
+    if (has_next && !(values[idx] < values[idx + 1])) {
+      throw std::invalid_argument("Values must be unique and monotonically increasing");
+    }
+  }
+}
 } /* namespace datasketches */
 #endif // _TDIGEST_IMPL_HPP_

data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp CHANGED Viewed

@@ -35,6 +35,9 @@ TEST_CASE("empty", "[tdigest]") {
   REQUIRE_THROWS_AS(td.get_max_value(), std::runtime_error);
   REQUIRE_THROWS_AS(td.get_rank(0), std::runtime_error);
   REQUIRE_THROWS_AS(td.get_quantile(0.5), std::runtime_error);
+  const double split_points[1] {0};
+  REQUIRE_THROWS_AS(td.get_PMF(split_points, 1), std::runtime_error);
+  REQUIRE_THROWS_AS(td.get_CDF(split_points, 1), std::runtime_error);
 }
 TEST_CASE("one value", "[tdigest]") {
@@ -56,9 +59,6 @@ TEST_CASE("many values", "[tdigest]") {
   const size_t n = 10000;
   tdigest_double td;
   for (size_t i = 0; i < n; ++i) td.update(i);
-//  std::cout << td.to_string(true);
-//  td.compress();
-//  std::cout << td.to_string(true);
   REQUIRE_FALSE(td.is_empty());
   REQUIRE(td.get_total_weight() == n);
   REQUIRE(td.get_min_value() == 0);
@@ -73,6 +73,15 @@ TEST_CASE("many values", "[tdigest]") {
   REQUIRE(td.get_quantile(0.9) == Approx(n * 0.9).epsilon(0.01));
   REQUIRE(td.get_quantile(0.95) == Approx(n * 0.95).epsilon(0.01));
   REQUIRE(td.get_quantile(1) == n - 1);
+  const double split_points[1] {n / 2};
+  const auto pmf = td.get_PMF(split_points, 1);
+  REQUIRE(pmf.size() == 2);
+  REQUIRE(pmf[0] == Approx(0.5).margin(0.0001));
+  REQUIRE(pmf[1] == Approx(0.5).margin(0.0001));
+  const auto cdf = td.get_CDF(split_points, 1);
+  REQUIRE(cdf.size() == 2);
+  REQUIRE(cdf[0] == Approx(0.5).margin(0.0001));
+  REQUIRE(cdf[1] == 1);
 }
 TEST_CASE("rank - two values", "[tdigest]") {

data/vendor/datasketches-cpp/theta/include/bit_packing.hpp CHANGED Viewed

@@ -329,7 +329,7 @@ static inline void pack_bits_13(const uint64_t* values, uint8_t* ptr) {
   *ptr++ = static_cast<uint8_t>(values[3] >> 4);
-  *ptr = static_cast<uint8_t>(values[3] >> 4);
+  *ptr = static_cast<uint8_t>(values[3] << 4);
   *ptr++ |= static_cast<uint8_t>(values[4] >> 9);
   *ptr++ = static_cast<uint8_t>(values[4] >> 1);
@@ -4227,7 +4227,7 @@ static inline void unpack_bits_33(uint64_t* values, const uint8_t* ptr) {
   values[6] |= *ptr >> 1;
   values[7] = static_cast<uint64_t>(*ptr++ & 1) << 32;
-  values[7] |= *ptr++ << 24;
+  values[7] |= static_cast<uint64_t>(*ptr++) << 24;
   values[7] |= *ptr++ << 16;
   values[7] |= *ptr++ << 8;
   values[7] |= *ptr;
@@ -4296,7 +4296,7 @@ static inline void unpack_bits_35(uint64_t* values, const uint8_t* ptr) {
   values[1] |= *ptr++ << 6;
   values[1] |= *ptr >> 2;
-  values[2] = static_cast<uint64_t>(*ptr++ & 2) << 33;
+  values[2] = static_cast<uint64_t>(*ptr++ & 3) << 33;
   values[2] |= static_cast<uint64_t>(*ptr++) << 25;
   values[2] |= *ptr++ << 17;
   values[2] |= *ptr++ << 9;
@@ -6201,7 +6201,7 @@ static inline void pack_bits_block8(const uint64_t* values, uint8_t* ptr, uint8_
     case 61: pack_bits_61(values, ptr); break;
     case 62: pack_bits_62(values, ptr); break;
     case 63: pack_bits_63(values, ptr); break;
-    default: throw std::logic_error("wrong number of bits " + std::to_string(bits));
+    default: throw std::logic_error("wrong number of bits in pack_bits_block8: " + std::to_string(bits));
   }
 }
@@ -6270,7 +6270,7 @@ static inline void unpack_bits_block8(uint64_t* values, const uint8_t* ptr, uint
     case 61: unpack_bits_61(values, ptr); break;
     case 62: unpack_bits_62(values, ptr); break;
     case 63: unpack_bits_63(values, ptr); break;
-    default: throw std::logic_error("wrong number of bits " + std::to_string(bits));
+    default: throw std::logic_error("wrong number of bits in unpack_bits_block8: " + std::to_string(bits));
   }
 }