datasketches 0.1.2 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (160) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +12 -13
  4. data/ext/datasketches/ext.cpp +1 -1
  5. data/ext/datasketches/ext.h +4 -0
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/fi_wrapper.cpp +6 -8
  8. data/ext/datasketches/hll_wrapper.cpp +13 -14
  9. data/ext/datasketches/kll_wrapper.cpp +28 -76
  10. data/ext/datasketches/theta_wrapper.cpp +27 -41
  11. data/ext/datasketches/vo_wrapper.cpp +4 -6
  12. data/lib/datasketches/version.rb +1 -1
  13. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  14. data/vendor/datasketches-cpp/README.md +4 -4
  15. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
  16. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
  17. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
  18. data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
  19. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
  20. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
  21. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
  22. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
  23. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
  24. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
  25. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
  26. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
  27. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
  28. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
  29. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
  30. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
  31. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
  32. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
  33. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
  34. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
  35. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
  36. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
  37. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
  38. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  39. data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
  40. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
  41. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
  43. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
  44. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
  45. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
  46. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
  47. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
  48. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
  49. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
  50. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
  51. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
  52. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
  53. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
  54. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
  55. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
  56. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
  57. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
  58. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
  59. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
  60. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
  62. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
  63. data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
  64. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
  65. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
  66. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
  67. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
  68. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
  70. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
  71. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
  72. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
  74. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
  75. data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
  76. data/vendor/datasketches-cpp/python/README.md +6 -3
  77. data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
  78. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
  79. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
  80. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
  81. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
  82. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
  83. data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
  84. data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
  85. data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
  86. data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
  87. data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
  88. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
  89. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
  90. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
  91. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
  92. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
  93. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
  94. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
  95. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  96. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  97. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  98. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  99. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  100. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
  101. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
  102. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
  103. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
  104. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
  105. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
  106. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
  107. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
  108. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
  109. data/vendor/datasketches-cpp/setup.py +5 -3
  110. data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
  111. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
  112. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
  114. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
  115. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
  116. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
  117. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
  119. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
  120. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
  121. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
  122. data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
  123. data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
  124. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
  125. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
  126. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
  127. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
  128. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
  129. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
  130. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
  131. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
  132. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
  133. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
  134. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  135. data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
  136. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
  137. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
  138. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
  139. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
  140. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
  141. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
  142. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
  143. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
  144. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
  145. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
  146. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
  147. metadata +43 -34
  148. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
  149. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
  150. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
  151. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
  152. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
  153. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
  154. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
  155. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
  156. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  157. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
  160. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -2,9 +2,7 @@
2
2
 
3
3
  #include <var_opt_sketch.hpp>
4
4
 
5
- #include <rice/Array.hpp>
6
- #include <rice/Constructor.hpp>
7
- #include <rice/Module.hpp>
5
+ #include "ext.h"
8
6
 
9
7
  using datasketches::var_opt_sketch;
10
8
 
@@ -19,7 +17,7 @@ void bind_vo_sketch(Rice::Module &m, const char* name) {
19
17
  .define_method("reset", &var_opt_sketch<T>::reset)
20
18
  .define_method(
21
19
  "samples",
22
- *[](var_opt_sketch<T>& self) {
20
+ [](var_opt_sketch<T>& self) {
23
21
  auto a = Rice::Array();
24
22
  for (auto item : self) {
25
23
  auto t = Rice::Array();
@@ -31,9 +29,9 @@ void bind_vo_sketch(Rice::Module &m, const char* name) {
31
29
  })
32
30
  .define_method(
33
31
  "update",
34
- *[](var_opt_sketch<T>& self, const T item) {
32
+ [](var_opt_sketch<T>& self, const T item) {
35
33
  self.update(item);
36
- });
34
+ }, Rice::Arg("item").keepAlive());
37
35
  }
38
36
 
39
37
  void init_vo(Rice::Module& m) {
@@ -1,3 +1,3 @@
1
1
  module DataSketches
2
- VERSION = "0.1.2"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -96,6 +96,7 @@ add_subdirectory(fi)
96
96
  add_subdirectory(theta)
97
97
  add_subdirectory(sampling)
98
98
  add_subdirectory(tuple)
99
+ add_subdirectory(req)
99
100
 
100
101
  if (WITH_PYTHON)
101
102
  add_subdirectory(python)
@@ -1,18 +1,18 @@
1
- # DataSketches Core C++ Library Component
2
- This is the core C++ component of the DataSketches library. It contains all of the key sketching algorithms that are in the Java component and can be accessed directly from user applications.
1
+ # Apache DataSketches Core C++ Library Component
2
+ This is the core C++ component of the Apache DataSketches library. It contains all of the key sketching algorithms that are in the Java component and can be accessed directly from user applications.
3
3
 
4
4
  This component is also a dependency of other components of the library that create adaptors for target systems, such as PostgreSQL.
5
5
 
6
6
  Note that we have a parallel core component for Java implementations of the same sketch algorithms,
7
7
  [datasketches-java](https://github.com/apache/datasketches-java).
8
8
 
9
- Please visit the main [DataSketches website](https://datasketches.apache.org) for more information.
9
+ Please visit the main [Apache DataSketches website](https://datasketches.apache.org) for more information.
10
10
 
11
11
  If you are interested in making contributions to this site please see our [Community](https://datasketches.apache.org/docs/Community/) page for how to contact us.
12
12
 
13
13
  ---
14
14
 
15
- This code requires C++11. It was tested with GCC 4.8.5 (standard in RedHat at the time of this writing), GCC 8.2.0 and Apple LLVM version 10.0.1 (clang-1001.0.46.4)
15
+ This code requires C++11.
16
16
 
17
17
  This includes Python bindings. For the Python interface, see the README notes in [the python subdirectory](https://github.com/apache/datasketches-cpp/tree/master/python).
18
18
 
@@ -3,6 +3,7 @@
3
3
  // * Changed input seed in MurmurHash3_x64_128 to uint64_t
4
4
  // * Define and use HashState reference to return result
5
5
  // * Made entire hash function defined inline
6
+ // * Added compute_seed_hash
6
7
  //-----------------------------------------------------------------------------
7
8
  // MurmurHash3 was written by Austin Appleby, and is placed in the public
8
9
  // domain. The author hereby disclaims copyright to this source code.
@@ -170,4 +171,10 @@ FORCE_INLINE void MurmurHash3_x64_128(const void* key, int lenBytes, uint64_t se
170
171
 
171
172
  //-----------------------------------------------------------------------------
172
173
 
174
+ FORCE_INLINE uint16_t compute_seed_hash(uint64_t seed) {
175
+ HashState hashes;
176
+ MurmurHash3_x64_128(&seed, sizeof(seed), 0, hashes);
177
+ return static_cast<uint16_t>(hashes.h1 & 0xffff);
178
+ }
179
+
173
180
  #endif // _MURMURHASH3_H_
@@ -52,6 +52,18 @@ static inline size_t copy_to_mem(const void* src, void* dst, size_t size) {
52
52
  return size;
53
53
  }
54
54
 
55
+ template<typename T>
56
+ static inline size_t copy_to_mem(const T& item, void* dst) {
57
+ memcpy(dst, &item, sizeof(T));
58
+ return sizeof(T);
59
+ }
60
+
61
+ template<typename T>
62
+ static inline size_t copy_from_mem(const void* src, T& item) {
63
+ memcpy(&item, src, sizeof(T));
64
+ return sizeof(T);
65
+ }
66
+
55
67
  } // namespace
56
68
 
57
69
  #endif // _MEMORY_OPERATIONS_HPP_
@@ -15,6 +15,10 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
+ # two parts here, the common test code for other parts to use,
19
+ # and an integration test using the other parts of the library.
20
+
21
+ # common dependencies for tests
18
22
  add_library(common_test OBJECT "")
19
23
 
20
24
  set_target_properties(common_test PROPERTIES
@@ -36,3 +40,23 @@ target_sources(common_test
36
40
  ${CMAKE_CURRENT_SOURCE_DIR}/catch_runner.cpp
37
41
  ${CMAKE_CURRENT_SOURCE_DIR}/test_allocator.cpp
38
42
  )
43
+
44
+ # now the integration test part
45
+ add_executable(integration_test)
46
+
47
+ target_link_libraries(integration_test cpc fi hll kll req sampling theta tuple common_test)
48
+
49
+ set_target_properties(integration_test PROPERTIES
50
+ CXX_STANDARD 11
51
+ CXX_STANDARD_REQUIRED YES
52
+ )
53
+
54
+ add_test(
55
+ NAME integration_test
56
+ COMMAND integration_test
57
+ )
58
+
59
+ target_sources(integration_test
60
+ PRIVATE
61
+ integration_test.cpp
62
+ )
@@ -0,0 +1,77 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch.hpp>
21
+
22
+ #include "cpc_sketch.hpp"
23
+ #include "cpc_union.hpp"
24
+ #include "frequent_items_sketch.hpp"
25
+ #include "hll.hpp"
26
+ #include "kll_sketch.hpp"
27
+ #include "req_sketch.hpp"
28
+ #include "var_opt_sketch.hpp"
29
+ #include "var_opt_union.hpp"
30
+ #include "theta_sketch.hpp"
31
+ #include "theta_union.hpp"
32
+ #include "theta_intersection.hpp"
33
+ #include "theta_a_not_b.hpp"
34
+ #include "tuple_sketch.hpp"
35
+ #include "tuple_union.hpp"
36
+ #include "tuple_intersection.hpp"
37
+ #include "tuple_a_not_b.hpp"
38
+
39
+ namespace datasketches {
40
+
41
+ template<typename Summary>
42
+ struct subtracting_intersection_policy {
43
+ void operator()(Summary& summary, const Summary& other) const {
44
+ summary -= other;
45
+ }
46
+ };
47
+
48
+ using tuple_intersection_float = tuple_intersection<float, subtracting_intersection_policy<float>>;
49
+
50
+ TEST_CASE("integration: declare all sketches", "[integration]") {
51
+ cpc_sketch cpc(12);
52
+ cpc_union cpc_u(12);
53
+
54
+ frequent_items_sketch<std::string> fi(100);
55
+
56
+ hll_sketch hll(13);
57
+ hll_union hll_u(13);
58
+
59
+ kll_sketch<double> kll(200);
60
+
61
+ req_sketch<double> req(12);
62
+
63
+ var_opt_sketch<std::string> vo(100);
64
+ var_opt_union<std::string> vo_u(100);
65
+
66
+ update_theta_sketch theta = update_theta_sketch::builder().build();
67
+ theta_union theta_u = theta_union::builder().build();
68
+ theta_intersection theta_i;
69
+ theta_a_not_b theta_anb;
70
+
71
+ auto tuple = update_tuple_sketch<float>::builder().build();
72
+ auto tuple_u = tuple_union<float>::builder().build();
73
+ tuple_intersection_float tuple_i;
74
+ tuple_a_not_b<float> tuple_anb;
75
+ }
76
+
77
+ } /* namespace datasketches */
@@ -22,6 +22,7 @@
22
22
 
23
23
  #include <new>
24
24
  #include <utility>
25
+ #include <stdexcept>
25
26
 
26
27
  // this allocator keeps the total allocated size in a global variable for testing
27
28
 
@@ -43,7 +44,14 @@ public:
43
44
  template <class U>
44
45
  struct rebind { typedef test_allocator<U> other; };
45
46
 
46
- test_allocator() {}
47
+ // this is to test that a given instance of an allocator is used instead of instantiating
48
+ static const bool DISALLOW_DEFAULT_CONSTRUCTOR = true;
49
+ test_allocator() {
50
+ if (DISALLOW_DEFAULT_CONSTRUCTOR) throw std::runtime_error("test_allocator: default constructor");
51
+ }
52
+ // call this constructor in tests and pass an allocator instance
53
+ test_allocator(int) {}
54
+
47
55
  test_allocator(const test_allocator&) {}
48
56
  template <class U>
49
57
  test_allocator(const test_allocator<U>&) {}
@@ -44,6 +44,8 @@ template<typename A> class u32_table;
44
44
 
45
45
  template<typename A>
46
46
  struct compressed_state {
47
+ explicit compressed_state(const A& allocator): table_data(allocator), table_data_words(0), table_num_entries(0),
48
+ window_data(allocator), window_data_words(0) {}
47
49
  vector_u32<A> table_data;
48
50
  uint32_t table_data_words;
49
51
  uint32_t table_num_entries; // can be different from the number of entries in the sketch in hybrid mode
@@ -53,6 +55,7 @@ struct compressed_state {
53
55
 
54
56
  template<typename A>
55
57
  struct uncompressed_state {
58
+ explicit uncompressed_state(const A& allocator): table(allocator), window(allocator) {}
56
59
  u32_table<A> table;
57
60
  vector_u8<A> window;
58
61
  };
@@ -129,14 +129,14 @@ private:
129
129
  void compress_surprising_values(const vector_u32<A>& pairs, uint8_t lg_k, compressed_state<A>& result) const;
130
130
  void compress_sliding_window(const uint8_t* window, uint8_t lg_k, uint32_t num_coupons, compressed_state<A>& target) const;
131
131
 
132
- vector_u32<A> uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs, uint8_t lg_k) const;
132
+ vector_u32<A> uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs, uint8_t lg_k, const A& allocator) const;
133
133
  void uncompress_sliding_window(const uint32_t* data, size_t data_words, vector_u8<A>& window, uint8_t lg_k, uint32_t num_coupons) const;
134
134
 
135
135
  static size_t safe_length_for_compressed_pair_buf(uint64_t k, size_t num_pairs, size_t num_base_bits);
136
136
  static size_t safe_length_for_compressed_window_buf(uint64_t k);
137
137
  static uint8_t determine_pseudo_phase(uint8_t lg_k, uint64_t c);
138
138
 
139
- static inline vector_u32<A> tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space);
139
+ static inline vector_u32<A> tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space, const A& allocator);
140
140
  static inline uint64_t golomb_choose_number_of_base_bits(uint64_t k, uint64_t count);
141
141
  };
142
142
 
@@ -160,7 +160,7 @@ template<typename A>
160
160
  void cpc_compressor<A>::uncompress(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint64_t num_coupons) const {
161
161
  switch (cpc_sketch_alloc<A>::determine_flavor(lg_k, num_coupons)) {
162
162
  case cpc_sketch_alloc<A>::flavor::EMPTY:
163
- target.table = u32_table<A>(2, 6 + lg_k);
163
+ target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
164
164
  break;
165
165
  case cpc_sketch_alloc<A>::flavor::SPARSE:
166
166
  uncompress_sparse_flavor(source, target, lg_k);
@@ -191,8 +191,9 @@ template<typename A>
191
191
  void cpc_compressor<A>::uncompress_sparse_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k) const {
192
192
  if (source.window_data.size() > 0) throw std::logic_error("unexpected sliding window");
193
193
  if (source.table_data.size() == 0) throw std::logic_error("table is expected");
194
- vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries, lg_k);
195
- target.table = u32_table<A>::make_from_pairs(pairs.data(), source.table_num_entries, lg_k);
194
+ vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
195
+ lg_k, source.table_data.get_allocator());
196
+ target.table = u32_table<A>::make_from_pairs(pairs.data(), source.table_num_entries, lg_k, pairs.get_allocator());
196
197
  }
197
198
 
198
199
  // This is complicated because it effectively builds a Sparse version
@@ -206,7 +207,7 @@ void cpc_compressor<A>::compress_hybrid_flavor(const cpc_sketch_alloc<A>& source
206
207
  if (pairs_from_table.size() > 0) u32_table<A>::introspective_insertion_sort(pairs_from_table.data(), 0, pairs_from_table.size());
207
208
  const size_t num_pairs_from_window = source.get_num_coupons() - pairs_from_table.size(); // because the window offset is zero
208
209
 
209
- vector_u32<A> all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window, pairs_from_table.size());
210
+ vector_u32<A> all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window, pairs_from_table.size(), source.get_allocator());
210
211
 
211
212
  u32_table<A>::merge(
212
213
  pairs_from_table.data(), 0, pairs_from_table.size(),
@@ -221,7 +222,8 @@ template<typename A>
221
222
  void cpc_compressor<A>::uncompress_hybrid_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k) const {
222
223
  if (source.window_data.size() > 0) throw std::logic_error("window is not expected");
223
224
  if (source.table_data.size() == 0) throw std::logic_error("table is expected");
224
- vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries, lg_k);
225
+ vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
226
+ lg_k, source.table_data.get_allocator());
225
227
 
226
228
  // In the hybrid flavor, some of these pairs actually
227
229
  // belong in the window, so we will separate them out,
@@ -240,7 +242,7 @@ void cpc_compressor<A>::uncompress_hybrid_flavor(const compressed_state<A>& sour
240
242
  pairs[next_true_pair++] = row_col; // move true pair down
241
243
  }
242
244
  }
243
- target.table = u32_table<A>::make_from_pairs(pairs.data(), next_true_pair, lg_k);
245
+ target.table = u32_table<A>::make_from_pairs(pairs.data(), next_true_pair, lg_k, pairs.get_allocator());
244
246
  }
245
247
 
246
248
  template<typename A>
@@ -264,21 +266,23 @@ void cpc_compressor<A>::compress_pinned_flavor(const cpc_sketch_alloc<A>& source
264
266
  }
265
267
 
266
268
  template<typename A>
267
- void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint32_t num_coupons) const {
269
+ void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& source, uncompressed_state<A>& target,
270
+ uint8_t lg_k, uint32_t num_coupons) const {
268
271
  if (source.window_data.size() == 0) throw std::logic_error("window is expected");
269
272
  uncompress_sliding_window(source.window_data.data(), source.window_data_words, target.window, lg_k, num_coupons);
270
273
  const size_t num_pairs = source.table_num_entries;
271
274
  if (num_pairs == 0) {
272
- target.table = u32_table<A>(2, 6 + lg_k);
275
+ target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
273
276
  } else {
274
277
  if (source.table_data.size() == 0) throw std::logic_error("table is expected");
275
- vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs, lg_k);
278
+ vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
279
+ lg_k, source.table_data.get_allocator());
276
280
  // undo the compressor's 8-column shift
277
281
  for (size_t i = 0; i < num_pairs; i++) {
278
282
  if ((pairs[i] & 63) >= 56) throw std::logic_error("(pairs[i] & 63) >= 56");
279
283
  pairs[i] += 8;
280
284
  }
281
- target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k);
285
+ target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k, pairs.get_allocator());
282
286
  }
283
287
  }
284
288
 
@@ -314,15 +318,17 @@ void cpc_compressor<A>::compress_sliding_flavor(const cpc_sketch_alloc<A>& sourc
314
318
  }
315
319
 
316
320
  template<typename A>
317
- void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint32_t num_coupons) const {
321
+ void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& source, uncompressed_state<A>& target,
322
+ uint8_t lg_k, uint32_t num_coupons) const {
318
323
  if (source.window_data.size() == 0) throw std::logic_error("window is expected");
319
324
  uncompress_sliding_window(source.window_data.data(), source.window_data_words, target.window, lg_k, num_coupons);
320
325
  const size_t num_pairs = source.table_num_entries;
321
326
  if (num_pairs == 0) {
322
- target.table = u32_table<A>(2, 6 + lg_k);
327
+ target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
323
328
  } else {
324
329
  if (source.table_data.size() == 0) throw std::logic_error("table is expected");
325
- vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs, lg_k);
330
+ vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
331
+ lg_k, source.table_data.get_allocator());
326
332
 
327
333
  const uint8_t pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
328
334
  if (pseudo_phase >= 16) throw std::logic_error("pseudo phase >= 16");
@@ -342,7 +348,7 @@ void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& sou
342
348
  pairs[i] = (row << 6) | col;
343
349
  }
344
350
 
345
- target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k);
351
+ target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k, pairs.get_allocator());
346
352
  }
347
353
  }
348
354
 
@@ -364,9 +370,10 @@ void cpc_compressor<A>::compress_surprising_values(const vector_u32<A>& pairs, u
364
370
  }
365
371
 
366
372
  template<typename A>
367
- vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs, uint8_t lg_k) const {
373
+ vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs,
374
+ uint8_t lg_k, const A& allocator) const {
368
375
  const size_t k = 1 << lg_k;
369
- vector_u32<A> pairs(num_pairs);
376
+ vector_u32<A> pairs(num_pairs, 0, allocator);
370
377
  const uint8_t num_base_bits = golomb_choose_number_of_base_bits(k + num_pairs, num_pairs);
371
378
  low_level_uncompress_pairs(pairs.data(), num_pairs, num_base_bits, data, data_words);
372
379
  return pairs;
@@ -388,7 +395,8 @@ void cpc_compressor<A>::compress_sliding_window(const uint8_t* window, uint8_t l
388
395
  }
389
396
 
390
397
  template<typename A>
391
- void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data, size_t data_words, vector_u8<A>& window, uint8_t lg_k, uint32_t num_coupons) const {
398
+ void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data, size_t data_words, vector_u8<A>& window,
399
+ uint8_t lg_k, uint32_t num_coupons) const {
392
400
  const size_t k = 1 << lg_k;
393
401
  window.resize(k); // zeroing not needed here (unlike the Hybrid Flavor)
394
402
  const uint8_t pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
@@ -710,9 +718,10 @@ void write_unary(
710
718
  // The empty space that this leaves at the beginning of the output array
711
719
  // will be filled in later by the caller.
712
720
  template<typename A>
713
- vector_u32<A> cpc_compressor<A>::tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space) {
721
+ vector_u32<A> cpc_compressor<A>::tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get,
722
+ uint32_t empty_space, const A& allocator) {
714
723
  const size_t output_length = empty_space + num_pairs_to_get;
715
- vector_u32<A> pairs(output_length);
724
+ vector_u32<A> pairs(output_length, 0, allocator);
716
725
  size_t pair_index = empty_space;
717
726
  for (unsigned row_index = 0; row_index < k; row_index++) {
718
727
  uint8_t byte = window[row_index];
@@ -49,7 +49,7 @@ template<typename A> class cpc_sketch_alloc;
49
49
  template<typename A> class cpc_union_alloc;
50
50
 
51
51
  // alias with default allocator for convenience
52
- typedef cpc_sketch_alloc<std::allocator<void>> cpc_sketch;
52
+ using cpc_sketch = cpc_sketch_alloc<std::allocator<uint8_t>>;
53
53
 
54
54
  // allocation and initialization of global decompression (decoding) tables
55
55
  // call this before anything else if you want to control the initialization time
@@ -67,7 +67,10 @@ public:
67
67
  * @param lg_k base 2 logarithm of the number of bins in the sketch
68
68
  * @param seed for hash function
69
69
  */
70
- explicit cpc_sketch_alloc(uint8_t lg_k = CPC_DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED);
70
+ explicit cpc_sketch_alloc(uint8_t lg_k = CPC_DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
71
+
72
+ using allocator_type = A;
73
+ A get_allocator() const;
71
74
 
72
75
  /**
73
76
  * @return configured lg_k of this sketch
@@ -204,7 +207,7 @@ public:
204
207
 
205
208
  // This is a convenience alias for users
206
209
  // The type returned by the following serialize method
207
- typedef vector_u8<A> vector_bytes;
210
+ using vector_bytes = vector_u8<A>;
208
211
 
209
212
  /**
210
213
  * This method serializes the sketch as a vector of bytes.
@@ -221,7 +224,7 @@ public:
221
224
  * @param seed the seed for the hash function that was used to create the sketch
222
225
  * @return an instance of a sketch
223
226
  */
224
- static cpc_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
227
+ static cpc_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
225
228
 
226
229
  /**
227
230
  * This method deserializes a sketch from a given array of bytes.
@@ -230,7 +233,7 @@ public:
230
233
  * @param seed the seed for the hash function that was used to create the sketch
231
234
  * @return an instance of the sketch
232
235
  */
233
- static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
236
+ static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
234
237
 
235
238
  // for internal use
236
239
  uint32_t get_num_coupons() const;