RubyGems - datasketches - Versions diffs - 0.1.2 → 0.2.0 - Mend

datasketches 0.1.2 → 0.2.0

Files changed (160)

data/ext/datasketches/vo_wrapper.cpp CHANGED

@@ -2,9 +2,7 @@
 #include <var_opt_sketch.hpp>
-#include <rice/Array.hpp>
-#include <rice/Constructor.hpp>
-#include <rice/Module.hpp>
+#include "ext.h"
 using datasketches::var_opt_sketch;
@@ -19,7 +17,7 @@ void bind_vo_sketch(Rice::Module &m, const char* name) {
     .define_method("reset", &var_opt_sketch<T>::reset)
     .define_method(
       "samples",
-      *[](var_opt_sketch<T>& self) {
+      [](var_opt_sketch<T>& self) {
         auto a = Rice::Array();
         for (auto item : self) {
           auto t = Rice::Array();
@@ -31,9 +29,9 @@ void bind_vo_sketch(Rice::Module &m, const char* name) {
       })
     .define_method(
       "update",
-      *[](var_opt_sketch<T>& self, const T item) {
+      [](var_opt_sketch<T>& self, const T item) {
         self.update(item);
-      });
+      }, Rice::Arg("item").keepAlive());
 }
 void init_vo(Rice::Module& m) {

data/lib/datasketches/version.rb CHANGED

@@ -1,3 +1,3 @@
 module DataSketches
-  VERSION = "0.1.2"
+  VERSION = "0.2.0"
 end

data/vendor/datasketches-cpp/CMakeLists.txt CHANGED

@@ -96,6 +96,7 @@ add_subdirectory(fi)
 add_subdirectory(theta)
 add_subdirectory(sampling)
 add_subdirectory(tuple)
+add_subdirectory(req)
 if (WITH_PYTHON)
   add_subdirectory(python)

data/vendor/datasketches-cpp/README.md CHANGED

@@ -1,18 +1,18 @@
-# DataSketches Core C++ Library Component
-This is the core C++ component of the DataSketches library.  It contains all of the key sketching algorithms that are in the Java component and can be accessed directly from user applications.
+# Apache DataSketches Core C++ Library Component
+This is the core C++ component of the Apache DataSketches library.  It contains all of the key sketching algorithms that are in the Java component and can be accessed directly from user applications.
 This component is also a dependency of other components of the library that create adaptors for target systems, such as PostgreSQL.
 Note that we have a parallel core component for Java implementations of the same sketch algorithms,
 [datasketches-java](https://github.com/apache/datasketches-java).
-Please visit the main [DataSketches website](https://datasketches.apache.org) for more information.
+Please visit the main [Apache DataSketches website](https://datasketches.apache.org) for more information.
 If you are interested in making contributions to this site please see our [Community](https://datasketches.apache.org/docs/Community/) page for how to contact us.
 ---
-This code requires C++11. It was tested with GCC 4.8.5 (standard in RedHat at the time of this writing), GCC 8.2.0 and Apple LLVM version 10.0.1 (clang-1001.0.46.4)
+This code requires C++11.
 This includes Python bindings. For the Python interface, see the README notes in [the python subdirectory](https://github.com/apache/datasketches-cpp/tree/master/python).

data/vendor/datasketches-cpp/common/include/MurmurHash3.h CHANGED

@@ -3,6 +3,7 @@
 //  * Changed input seed in MurmurHash3_x64_128 to uint64_t
 //  * Define and use HashState reference to return result
 //  * Made entire hash function defined inline
+//  * Added compute_seed_hash
 //-----------------------------------------------------------------------------
 // MurmurHash3 was written by Austin Appleby, and is placed in the public
 // domain. The author hereby disclaims copyright to this source code.
@@ -170,4 +171,10 @@ FORCE_INLINE void MurmurHash3_x64_128(const void* key, int lenBytes, uint64_t se
 //-----------------------------------------------------------------------------
+FORCE_INLINE uint16_t compute_seed_hash(uint64_t seed) {
+  HashState hashes;
+  MurmurHash3_x64_128(&seed, sizeof(seed), 0, hashes);
+  return static_cast<uint16_t>(hashes.h1 & 0xffff);
+}
 #endif // _MURMURHASH3_H_

data/vendor/datasketches-cpp/common/include/memory_operations.hpp CHANGED

@@ -52,6 +52,18 @@ static inline size_t copy_to_mem(const void* src, void* dst, size_t size) {
   return size;
 }
+template<typename T>
+static inline size_t copy_to_mem(const T& item, void* dst) {
+  memcpy(dst, &item, sizeof(T));
+  return sizeof(T);
+}
+template<typename T>
+static inline size_t copy_from_mem(const void* src, T& item) {
+  memcpy(&item, src, sizeof(T));
+  return sizeof(T);
+}
 } // namespace
 #endif // _MEMORY_OPERATIONS_HPP_

data/vendor/datasketches-cpp/common/test/CMakeLists.txt CHANGED

@@ -15,6 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
+# two parts here, the common test code for other parts to use,
+# and an integration test using the other parts of the library.
+# common dependencies for tests
 add_library(common_test OBJECT "")
 set_target_properties(common_test PROPERTIES
@@ -36,3 +40,23 @@ target_sources(common_test
     ${CMAKE_CURRENT_SOURCE_DIR}/catch_runner.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/test_allocator.cpp
 )
+# now the integration test part
+add_executable(integration_test)
+target_link_libraries(integration_test cpc fi hll kll req sampling theta tuple common_test)
+set_target_properties(integration_test PROPERTIES
+  CXX_STANDARD 11
+  CXX_STANDARD_REQUIRED YES
+)
+add_test(
+  NAME integration_test
+  COMMAND integration_test
+)
+target_sources(integration_test
+  PRIVATE
+    integration_test.cpp
+)

data/vendor/datasketches-cpp/common/test/integration_test.cpp ADDED

@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <catch.hpp>
+#include "cpc_sketch.hpp"
+#include "cpc_union.hpp"
+#include "frequent_items_sketch.hpp"
+#include "hll.hpp"
+#include "kll_sketch.hpp"
+#include "req_sketch.hpp"
+#include "var_opt_sketch.hpp"
+#include "var_opt_union.hpp"
+#include "theta_sketch.hpp"
+#include "theta_union.hpp"
+#include "theta_intersection.hpp"
+#include "theta_a_not_b.hpp"
+#include "tuple_sketch.hpp"
+#include "tuple_union.hpp"
+#include "tuple_intersection.hpp"
+#include "tuple_a_not_b.hpp"
+namespace datasketches {
+template<typename Summary>
+struct subtracting_intersection_policy {
+  void operator()(Summary& summary, const Summary& other) const {
+    summary -= other;
+  }
+};
+using tuple_intersection_float = tuple_intersection<float, subtracting_intersection_policy<float>>;
+TEST_CASE("integration: declare all sketches", "[integration]") {
+  cpc_sketch cpc(12);
+  cpc_union cpc_u(12);
+  frequent_items_sketch<std::string> fi(100);
+  hll_sketch hll(13);
+  hll_union hll_u(13);
+  kll_sketch<double> kll(200);
+  req_sketch<double> req(12);
+  var_opt_sketch<std::string> vo(100);
+  var_opt_union<std::string> vo_u(100);
+  update_theta_sketch theta = update_theta_sketch::builder().build();
+  theta_union theta_u = theta_union::builder().build();
+  theta_intersection theta_i;
+  theta_a_not_b theta_anb;
+  auto tuple = update_tuple_sketch<float>::builder().build();
+  auto tuple_u = tuple_union<float>::builder().build();
+  tuple_intersection_float tuple_i;
+  tuple_a_not_b<float> tuple_anb;
+}
+} /* namespace datasketches */

data/vendor/datasketches-cpp/common/test/test_allocator.hpp CHANGED

@@ -22,6 +22,7 @@
 #include <new>
 #include <utility>
+#include <stdexcept>
 // this allocator keeps the total allocated size in a global variable for testing
@@ -43,7 +44,14 @@ public:
   template <class U>
   struct rebind { typedef test_allocator<U> other; };
-  test_allocator() {}
+  // this is to test that a given instance of an allocator is used instead of instantiating
+  static const bool DISALLOW_DEFAULT_CONSTRUCTOR = true;
+  test_allocator() {
+    if (DISALLOW_DEFAULT_CONSTRUCTOR) throw std::runtime_error("test_allocator: default constructor");
+  }
+  // call this constructor in tests and pass an allocator instance
+  test_allocator(int) {}
   test_allocator(const test_allocator&) {}
   template <class U>
   test_allocator(const test_allocator<U>&) {}

data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp CHANGED

@@ -44,6 +44,8 @@ template<typename A> class u32_table;
 template<typename A>
 struct compressed_state {
+  explicit compressed_state(const A& allocator): table_data(allocator), table_data_words(0), table_num_entries(0),
+      window_data(allocator), window_data_words(0) {}
   vector_u32<A> table_data;
   uint32_t table_data_words;
   uint32_t table_num_entries; // can be different from the number of entries in the sketch in hybrid mode
@@ -53,6 +55,7 @@ struct compressed_state {
 template<typename A>
 struct uncompressed_state {
+  explicit uncompressed_state(const A& allocator): table(allocator), window(allocator) {}
   u32_table<A> table;
   vector_u8<A> window;
 };

data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp CHANGED

@@ -129,14 +129,14 @@ private:
   void compress_surprising_values(const vector_u32<A>& pairs, uint8_t lg_k, compressed_state<A>& result) const;
   void compress_sliding_window(const uint8_t* window, uint8_t lg_k, uint32_t num_coupons, compressed_state<A>& target) const;
-  vector_u32<A> uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs, uint8_t lg_k) const;
+  vector_u32<A> uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs, uint8_t lg_k, const A& allocator) const;
   void uncompress_sliding_window(const uint32_t* data, size_t data_words, vector_u8<A>& window, uint8_t lg_k, uint32_t num_coupons) const;
   static size_t safe_length_for_compressed_pair_buf(uint64_t k, size_t num_pairs, size_t num_base_bits);
   static size_t safe_length_for_compressed_window_buf(uint64_t k);
   static uint8_t determine_pseudo_phase(uint8_t lg_k, uint64_t c);
-  static inline vector_u32<A> tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space);
+  static inline vector_u32<A> tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space, const A& allocator);
   static inline uint64_t golomb_choose_number_of_base_bits(uint64_t k, uint64_t count);
 };

data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp CHANGED

@@ -160,7 +160,7 @@ template<typename A>
 void cpc_compressor<A>::uncompress(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint64_t num_coupons) const {
   switch (cpc_sketch_alloc<A>::determine_flavor(lg_k, num_coupons)) {
     case cpc_sketch_alloc<A>::flavor::EMPTY:
-      target.table = u32_table<A>(2, 6 + lg_k);
+      target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
       break;
     case cpc_sketch_alloc<A>::flavor::SPARSE:
       uncompress_sparse_flavor(source, target, lg_k);
@@ -191,8 +191,9 @@ template<typename A>
 void cpc_compressor<A>::uncompress_sparse_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k) const {
   if (source.window_data.size() > 0) throw std::logic_error("unexpected sliding window");
   if (source.table_data.size() == 0) throw std::logic_error("table is expected");
-  vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries, lg_k);
-  target.table = u32_table<A>::make_from_pairs(pairs.data(), source.table_num_entries, lg_k);
+  vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
+      lg_k, source.table_data.get_allocator());
+  target.table = u32_table<A>::make_from_pairs(pairs.data(), source.table_num_entries, lg_k, pairs.get_allocator());
 }
 // This is complicated because it effectively builds a Sparse version
@@ -206,7 +207,7 @@ void cpc_compressor<A>::compress_hybrid_flavor(const cpc_sketch_alloc<A>& source
   if (pairs_from_table.size() > 0) u32_table<A>::introspective_insertion_sort(pairs_from_table.data(), 0, pairs_from_table.size());
   const size_t num_pairs_from_window = source.get_num_coupons() - pairs_from_table.size(); // because the window offset is zero
-  vector_u32<A> all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window, pairs_from_table.size());
+  vector_u32<A> all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window, pairs_from_table.size(), source.get_allocator());
   u32_table<A>::merge(
       pairs_from_table.data(), 0, pairs_from_table.size(),
@@ -221,7 +222,8 @@ template<typename A>
 void cpc_compressor<A>::uncompress_hybrid_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k) const {
   if (source.window_data.size() > 0) throw std::logic_error("window is not expected");
   if (source.table_data.size() == 0) throw std::logic_error("table is expected");
-  vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries, lg_k);
+  vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
+      lg_k, source.table_data.get_allocator());
   // In the hybrid flavor, some of these pairs actually
   // belong in the window, so we will separate them out,
@@ -240,7 +242,7 @@ void cpc_compressor<A>::uncompress_hybrid_flavor(const compressed_state<A>& sour
       pairs[next_true_pair++] = row_col; // move true pair down
     }
   }
-  target.table = u32_table<A>::make_from_pairs(pairs.data(), next_true_pair, lg_k);
+  target.table = u32_table<A>::make_from_pairs(pairs.data(), next_true_pair, lg_k, pairs.get_allocator());
 }
 template<typename A>
@@ -264,21 +266,23 @@ void cpc_compressor<A>::compress_pinned_flavor(const cpc_sketch_alloc<A>& source
 }
 template<typename A>
-void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint32_t num_coupons) const {
+void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& source, uncompressed_state<A>& target,
+    uint8_t lg_k, uint32_t num_coupons) const {
   if (source.window_data.size() == 0) throw std::logic_error("window is expected");
   uncompress_sliding_window(source.window_data.data(), source.window_data_words, target.window, lg_k, num_coupons);
   const size_t num_pairs = source.table_num_entries;
   if (num_pairs == 0) {
-    target.table = u32_table<A>(2, 6 + lg_k);
+    target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
   } else {
     if (source.table_data.size() == 0) throw std::logic_error("table is expected");
-    vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs, lg_k);
+    vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
+        lg_k, source.table_data.get_allocator());
     // undo the compressor's 8-column shift
     for (size_t i = 0; i < num_pairs; i++) {
       if ((pairs[i] & 63) >= 56) throw std::logic_error("(pairs[i] & 63) >= 56");
       pairs[i] += 8;
     }
-    target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k);
+    target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k, pairs.get_allocator());
   }
 }
@@ -314,15 +318,17 @@ void cpc_compressor<A>::compress_sliding_flavor(const cpc_sketch_alloc<A>& sourc
 }
 template<typename A>
-void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint32_t num_coupons) const {
+void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& source, uncompressed_state<A>& target,
+    uint8_t lg_k, uint32_t num_coupons) const {
   if (source.window_data.size() == 0) throw std::logic_error("window is expected");
   uncompress_sliding_window(source.window_data.data(), source.window_data_words, target.window, lg_k, num_coupons);
   const size_t num_pairs = source.table_num_entries;
   if (num_pairs == 0) {
-    target.table = u32_table<A>(2, 6 + lg_k);
+    target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
   } else {
     if (source.table_data.size() == 0) throw std::logic_error("table is expected");
-    vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs, lg_k);
+    vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
+        lg_k, source.table_data.get_allocator());
     const uint8_t pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
     if (pseudo_phase >= 16) throw std::logic_error("pseudo phase >= 16");
@@ -342,7 +348,7 @@ void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& sou
       pairs[i] = (row << 6) | col;
     }
-    target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k);
+    target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k, pairs.get_allocator());
   }
 }
@@ -364,9 +370,10 @@ void cpc_compressor<A>::compress_surprising_values(const vector_u32<A>& pairs, u
 }
 template<typename A>
-vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs, uint8_t lg_k) const {
+vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs,
+    uint8_t lg_k, const A& allocator) const {
   const size_t k = 1 << lg_k;
-  vector_u32<A> pairs(num_pairs);
+  vector_u32<A> pairs(num_pairs, 0, allocator);
   const uint8_t num_base_bits = golomb_choose_number_of_base_bits(k + num_pairs, num_pairs);
   low_level_uncompress_pairs(pairs.data(), num_pairs, num_base_bits, data, data_words);
   return pairs;
@@ -388,7 +395,8 @@ void cpc_compressor<A>::compress_sliding_window(const uint8_t* window, uint8_t l
 }
 template<typename A>
-void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data, size_t data_words, vector_u8<A>& window, uint8_t lg_k, uint32_t num_coupons) const {
+void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data, size_t data_words, vector_u8<A>& window,
+    uint8_t lg_k, uint32_t num_coupons) const {
   const size_t k = 1 << lg_k;
   window.resize(k); // zeroing not needed here (unlike the Hybrid Flavor)
   const uint8_t pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
@@ -710,9 +718,10 @@ void write_unary(
 // The empty space that this leaves at the beginning of the output array
 // will be filled in later by the caller.
 template<typename A>
-vector_u32<A> cpc_compressor<A>::tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space) {
+vector_u32<A> cpc_compressor<A>::tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get,
+    uint32_t empty_space, const A& allocator) {
   const size_t output_length = empty_space + num_pairs_to_get;
-  vector_u32<A> pairs(output_length);
+  vector_u32<A> pairs(output_length, 0, allocator);
   size_t pair_index = empty_space;
   for (unsigned row_index = 0; row_index < k; row_index++) {
     uint8_t byte = window[row_index];

data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp CHANGED

@@ -49,7 +49,7 @@ template<typename A> class cpc_sketch_alloc;
 template<typename A> class cpc_union_alloc;
 // alias with default allocator for convenience
-typedef cpc_sketch_alloc<std::allocator<void>> cpc_sketch;
+using cpc_sketch = cpc_sketch_alloc<std::allocator<uint8_t>>;
 // allocation and initialization of global decompression (decoding) tables
 // call this before anything else if you want to control the initialization time
@@ -67,7 +67,10 @@ public:
    * @param lg_k base 2 logarithm of the number of bins in the sketch
    * @param seed for hash function
    */
-  explicit cpc_sketch_alloc(uint8_t lg_k = CPC_DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED);
+  explicit cpc_sketch_alloc(uint8_t lg_k = CPC_DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
+  using allocator_type = A;
+  A get_allocator() const;
   /**
    * @return configured lg_k of this sketch
@@ -204,7 +207,7 @@ public:
   // This is a convenience alias for users
   // The type returned by the following serialize method
-  typedef vector_u8<A> vector_bytes;
+  using vector_bytes = vector_u8<A>;
   /**
    * This method serializes the sketch as a vector of bytes.
@@ -221,7 +224,7 @@ public:
    * @param seed the seed for the hash function that was used to create the sketch
    * @return an instance of a sketch
    */
-  static cpc_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
+  static cpc_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
   /**
    * This method deserializes a sketch from a given array of bytes.
@@ -230,7 +233,7 @@ public:
    * @param seed the seed for the hash function that was used to create the sketch
    * @return an instance of the sketch
    */
-  static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
+  static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
   // for internal use
   uint32_t get_num_coupons() const;