datasketches 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +12 -13
  4. data/ext/datasketches/ext.cpp +1 -1
  5. data/ext/datasketches/ext.h +4 -0
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/fi_wrapper.cpp +6 -8
  8. data/ext/datasketches/hll_wrapper.cpp +13 -14
  9. data/ext/datasketches/kll_wrapper.cpp +28 -76
  10. data/ext/datasketches/theta_wrapper.cpp +27 -41
  11. data/ext/datasketches/vo_wrapper.cpp +4 -6
  12. data/lib/datasketches/version.rb +1 -1
  13. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  14. data/vendor/datasketches-cpp/README.md +4 -4
  15. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
  16. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
  17. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
  18. data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
  19. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
  20. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
  21. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
  22. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
  23. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
  24. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
  25. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
  26. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
  27. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
  28. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
  29. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
  30. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
  31. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
  32. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
  33. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
  34. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
  35. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
  36. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
  37. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
  38. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  39. data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
  40. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
  41. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
  43. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
  44. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
  45. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
  46. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
  47. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
  48. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
  49. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
  50. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
  51. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
  52. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
  53. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
  54. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
  55. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
  56. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
  57. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
  58. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
  59. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
  60. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
  62. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
  63. data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
  64. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
  65. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
  66. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
  67. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
  68. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
  70. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
  71. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
  72. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
  74. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
  75. data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
  76. data/vendor/datasketches-cpp/python/README.md +6 -3
  77. data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
  78. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
  79. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
  80. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
  81. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
  82. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
  83. data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
  84. data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
  85. data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
  86. data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
  87. data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
  88. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
  89. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
  90. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
  91. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
  92. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
  93. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
  94. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
  95. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  96. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  97. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  98. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  99. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  100. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
  101. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
  102. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
  103. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
  104. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
  105. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
  106. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
  107. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
  108. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
  109. data/vendor/datasketches-cpp/setup.py +5 -3
  110. data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
  111. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
  112. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
  114. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
  115. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
  116. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
  117. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
  119. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
  120. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
  121. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
  122. data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
  123. data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
  124. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
  125. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
  126. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
  127. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
  128. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
  129. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
  130. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
  131. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
  132. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
  133. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
  134. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  135. data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
  136. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
  137. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
  138. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
  139. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
  140. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
  141. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
  142. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
  143. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
  144. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
  145. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
  146. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
  147. metadata +43 -34
  148. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
  149. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
  150. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
  151. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
  152. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
  153. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
  154. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
  155. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
  156. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  157. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
  160. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -2,9 +2,7 @@
2
2
 
3
3
  #include <var_opt_sketch.hpp>
4
4
 
5
- #include <rice/Array.hpp>
6
- #include <rice/Constructor.hpp>
7
- #include <rice/Module.hpp>
5
+ #include "ext.h"
8
6
 
9
7
  using datasketches::var_opt_sketch;
10
8
 
@@ -19,7 +17,7 @@ void bind_vo_sketch(Rice::Module &m, const char* name) {
19
17
  .define_method("reset", &var_opt_sketch<T>::reset)
20
18
  .define_method(
21
19
  "samples",
22
- *[](var_opt_sketch<T>& self) {
20
+ [](var_opt_sketch<T>& self) {
23
21
  auto a = Rice::Array();
24
22
  for (auto item : self) {
25
23
  auto t = Rice::Array();
@@ -31,9 +29,9 @@ void bind_vo_sketch(Rice::Module &m, const char* name) {
31
29
  })
32
30
  .define_method(
33
31
  "update",
34
- *[](var_opt_sketch<T>& self, const T item) {
32
+ [](var_opt_sketch<T>& self, const T item) {
35
33
  self.update(item);
36
- });
34
+ }, Rice::Arg("item").keepAlive());
37
35
  }
38
36
 
39
37
  void init_vo(Rice::Module& m) {
@@ -1,3 +1,3 @@
1
1
  module DataSketches
2
- VERSION = "0.1.2"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -96,6 +96,7 @@ add_subdirectory(fi)
96
96
  add_subdirectory(theta)
97
97
  add_subdirectory(sampling)
98
98
  add_subdirectory(tuple)
99
+ add_subdirectory(req)
99
100
 
100
101
  if (WITH_PYTHON)
101
102
  add_subdirectory(python)
@@ -1,18 +1,18 @@
1
- # DataSketches Core C++ Library Component
2
- This is the core C++ component of the DataSketches library. It contains all of the key sketching algorithms that are in the Java component and can be accessed directly from user applications.
1
+ # Apache DataSketches Core C++ Library Component
2
+ This is the core C++ component of the Apache DataSketches library. It contains all of the key sketching algorithms that are in the Java component and can be accessed directly from user applications.
3
3
 
4
4
  This component is also a dependency of other components of the library that create adaptors for target systems, such as PostgreSQL.
5
5
 
6
6
  Note that we have a parallel core component for Java implementations of the same sketch algorithms,
7
7
  [datasketches-java](https://github.com/apache/datasketches-java).
8
8
 
9
- Please visit the main [DataSketches website](https://datasketches.apache.org) for more information.
9
+ Please visit the main [Apache DataSketches website](https://datasketches.apache.org) for more information.
10
10
 
11
11
  If you are interested in making contributions to this site please see our [Community](https://datasketches.apache.org/docs/Community/) page for how to contact us.
12
12
 
13
13
  ---
14
14
 
15
- This code requires C++11. It was tested with GCC 4.8.5 (standard in RedHat at the time of this writing), GCC 8.2.0 and Apple LLVM version 10.0.1 (clang-1001.0.46.4)
15
+ This code requires C++11.
16
16
 
17
17
  This includes Python bindings. For the Python interface, see the README notes in [the python subdirectory](https://github.com/apache/datasketches-cpp/tree/master/python).
18
18
 
@@ -3,6 +3,7 @@
3
3
  // * Changed input seed in MurmurHash3_x64_128 to uint64_t
4
4
  // * Define and use HashState reference to return result
5
5
  // * Made entire hash function defined inline
6
+ // * Added compute_seed_hash
6
7
  //-----------------------------------------------------------------------------
7
8
  // MurmurHash3 was written by Austin Appleby, and is placed in the public
8
9
  // domain. The author hereby disclaims copyright to this source code.
@@ -170,4 +171,10 @@ FORCE_INLINE void MurmurHash3_x64_128(const void* key, int lenBytes, uint64_t se
170
171
 
171
172
  //-----------------------------------------------------------------------------
172
173
 
174
+ FORCE_INLINE uint16_t compute_seed_hash(uint64_t seed) {
175
+ HashState hashes;
176
+ MurmurHash3_x64_128(&seed, sizeof(seed), 0, hashes);
177
+ return static_cast<uint16_t>(hashes.h1 & 0xffff);
178
+ }
179
+
173
180
  #endif // _MURMURHASH3_H_
@@ -52,6 +52,18 @@ static inline size_t copy_to_mem(const void* src, void* dst, size_t size) {
52
52
  return size;
53
53
  }
54
54
 
55
+ template<typename T>
56
+ static inline size_t copy_to_mem(const T& item, void* dst) {
57
+ memcpy(dst, &item, sizeof(T));
58
+ return sizeof(T);
59
+ }
60
+
61
+ template<typename T>
62
+ static inline size_t copy_from_mem(const void* src, T& item) {
63
+ memcpy(&item, src, sizeof(T));
64
+ return sizeof(T);
65
+ }
66
+
55
67
  } // namespace
56
68
 
57
69
  #endif // _MEMORY_OPERATIONS_HPP_
@@ -15,6 +15,10 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
+ # two parts here, the common test code for other parts to use,
19
+ # and an integration test using the other parts of the library.
20
+
21
+ # common dependencies for tests
18
22
  add_library(common_test OBJECT "")
19
23
 
20
24
  set_target_properties(common_test PROPERTIES
@@ -36,3 +40,23 @@ target_sources(common_test
36
40
  ${CMAKE_CURRENT_SOURCE_DIR}/catch_runner.cpp
37
41
  ${CMAKE_CURRENT_SOURCE_DIR}/test_allocator.cpp
38
42
  )
43
+
44
+ # now the integration test part
45
+ add_executable(integration_test)
46
+
47
+ target_link_libraries(integration_test cpc fi hll kll req sampling theta tuple common_test)
48
+
49
+ set_target_properties(integration_test PROPERTIES
50
+ CXX_STANDARD 11
51
+ CXX_STANDARD_REQUIRED YES
52
+ )
53
+
54
+ add_test(
55
+ NAME integration_test
56
+ COMMAND integration_test
57
+ )
58
+
59
+ target_sources(integration_test
60
+ PRIVATE
61
+ integration_test.cpp
62
+ )
@@ -0,0 +1,77 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch.hpp>
21
+
22
+ #include "cpc_sketch.hpp"
23
+ #include "cpc_union.hpp"
24
+ #include "frequent_items_sketch.hpp"
25
+ #include "hll.hpp"
26
+ #include "kll_sketch.hpp"
27
+ #include "req_sketch.hpp"
28
+ #include "var_opt_sketch.hpp"
29
+ #include "var_opt_union.hpp"
30
+ #include "theta_sketch.hpp"
31
+ #include "theta_union.hpp"
32
+ #include "theta_intersection.hpp"
33
+ #include "theta_a_not_b.hpp"
34
+ #include "tuple_sketch.hpp"
35
+ #include "tuple_union.hpp"
36
+ #include "tuple_intersection.hpp"
37
+ #include "tuple_a_not_b.hpp"
38
+
39
+ namespace datasketches {
40
+
41
+ template<typename Summary>
42
+ struct subtracting_intersection_policy {
43
+ void operator()(Summary& summary, const Summary& other) const {
44
+ summary -= other;
45
+ }
46
+ };
47
+
48
+ using tuple_intersection_float = tuple_intersection<float, subtracting_intersection_policy<float>>;
49
+
50
+ TEST_CASE("integration: declare all sketches", "[integration]") {
51
+ cpc_sketch cpc(12);
52
+ cpc_union cpc_u(12);
53
+
54
+ frequent_items_sketch<std::string> fi(100);
55
+
56
+ hll_sketch hll(13);
57
+ hll_union hll_u(13);
58
+
59
+ kll_sketch<double> kll(200);
60
+
61
+ req_sketch<double> req(12);
62
+
63
+ var_opt_sketch<std::string> vo(100);
64
+ var_opt_union<std::string> vo_u(100);
65
+
66
+ update_theta_sketch theta = update_theta_sketch::builder().build();
67
+ theta_union theta_u = theta_union::builder().build();
68
+ theta_intersection theta_i;
69
+ theta_a_not_b theta_anb;
70
+
71
+ auto tuple = update_tuple_sketch<float>::builder().build();
72
+ auto tuple_u = tuple_union<float>::builder().build();
73
+ tuple_intersection_float tuple_i;
74
+ tuple_a_not_b<float> tuple_anb;
75
+ }
76
+
77
+ } /* namespace datasketches */
@@ -22,6 +22,7 @@
22
22
 
23
23
  #include <new>
24
24
  #include <utility>
25
+ #include <stdexcept>
25
26
 
26
27
  // this allocator keeps the total allocated size in a global variable for testing
27
28
 
@@ -43,7 +44,14 @@ public:
43
44
  template <class U>
44
45
  struct rebind { typedef test_allocator<U> other; };
45
46
 
46
- test_allocator() {}
47
+ // this is to test that a given instance of an allocator is used instead of instantiating
48
+ static const bool DISALLOW_DEFAULT_CONSTRUCTOR = true;
49
+ test_allocator() {
50
+ if (DISALLOW_DEFAULT_CONSTRUCTOR) throw std::runtime_error("test_allocator: default constructor");
51
+ }
52
+ // call this constructor in tests and pass an allocator instance
53
+ test_allocator(int) {}
54
+
47
55
  test_allocator(const test_allocator&) {}
48
56
  template <class U>
49
57
  test_allocator(const test_allocator<U>&) {}
@@ -44,6 +44,8 @@ template<typename A> class u32_table;
44
44
 
45
45
  template<typename A>
46
46
  struct compressed_state {
47
+ explicit compressed_state(const A& allocator): table_data(allocator), table_data_words(0), table_num_entries(0),
48
+ window_data(allocator), window_data_words(0) {}
47
49
  vector_u32<A> table_data;
48
50
  uint32_t table_data_words;
49
51
  uint32_t table_num_entries; // can be different from the number of entries in the sketch in hybrid mode
@@ -53,6 +55,7 @@ struct compressed_state {
53
55
 
54
56
  template<typename A>
55
57
  struct uncompressed_state {
58
+ explicit uncompressed_state(const A& allocator): table(allocator), window(allocator) {}
56
59
  u32_table<A> table;
57
60
  vector_u8<A> window;
58
61
  };
@@ -129,14 +129,14 @@ private:
129
129
  void compress_surprising_values(const vector_u32<A>& pairs, uint8_t lg_k, compressed_state<A>& result) const;
130
130
  void compress_sliding_window(const uint8_t* window, uint8_t lg_k, uint32_t num_coupons, compressed_state<A>& target) const;
131
131
 
132
- vector_u32<A> uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs, uint8_t lg_k) const;
132
+ vector_u32<A> uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs, uint8_t lg_k, const A& allocator) const;
133
133
  void uncompress_sliding_window(const uint32_t* data, size_t data_words, vector_u8<A>& window, uint8_t lg_k, uint32_t num_coupons) const;
134
134
 
135
135
  static size_t safe_length_for_compressed_pair_buf(uint64_t k, size_t num_pairs, size_t num_base_bits);
136
136
  static size_t safe_length_for_compressed_window_buf(uint64_t k);
137
137
  static uint8_t determine_pseudo_phase(uint8_t lg_k, uint64_t c);
138
138
 
139
- static inline vector_u32<A> tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space);
139
+ static inline vector_u32<A> tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space, const A& allocator);
140
140
  static inline uint64_t golomb_choose_number_of_base_bits(uint64_t k, uint64_t count);
141
141
  };
142
142
 
@@ -160,7 +160,7 @@ template<typename A>
160
160
  void cpc_compressor<A>::uncompress(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint64_t num_coupons) const {
161
161
  switch (cpc_sketch_alloc<A>::determine_flavor(lg_k, num_coupons)) {
162
162
  case cpc_sketch_alloc<A>::flavor::EMPTY:
163
- target.table = u32_table<A>(2, 6 + lg_k);
163
+ target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
164
164
  break;
165
165
  case cpc_sketch_alloc<A>::flavor::SPARSE:
166
166
  uncompress_sparse_flavor(source, target, lg_k);
@@ -191,8 +191,9 @@ template<typename A>
191
191
  void cpc_compressor<A>::uncompress_sparse_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k) const {
192
192
  if (source.window_data.size() > 0) throw std::logic_error("unexpected sliding window");
193
193
  if (source.table_data.size() == 0) throw std::logic_error("table is expected");
194
- vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries, lg_k);
195
- target.table = u32_table<A>::make_from_pairs(pairs.data(), source.table_num_entries, lg_k);
194
+ vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
195
+ lg_k, source.table_data.get_allocator());
196
+ target.table = u32_table<A>::make_from_pairs(pairs.data(), source.table_num_entries, lg_k, pairs.get_allocator());
196
197
  }
197
198
 
198
199
  // This is complicated because it effectively builds a Sparse version
@@ -206,7 +207,7 @@ void cpc_compressor<A>::compress_hybrid_flavor(const cpc_sketch_alloc<A>& source
206
207
  if (pairs_from_table.size() > 0) u32_table<A>::introspective_insertion_sort(pairs_from_table.data(), 0, pairs_from_table.size());
207
208
  const size_t num_pairs_from_window = source.get_num_coupons() - pairs_from_table.size(); // because the window offset is zero
208
209
 
209
- vector_u32<A> all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window, pairs_from_table.size());
210
+ vector_u32<A> all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window, pairs_from_table.size(), source.get_allocator());
210
211
 
211
212
  u32_table<A>::merge(
212
213
  pairs_from_table.data(), 0, pairs_from_table.size(),
@@ -221,7 +222,8 @@ template<typename A>
221
222
  void cpc_compressor<A>::uncompress_hybrid_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k) const {
222
223
  if (source.window_data.size() > 0) throw std::logic_error("window is not expected");
223
224
  if (source.table_data.size() == 0) throw std::logic_error("table is expected");
224
- vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries, lg_k);
225
+ vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
226
+ lg_k, source.table_data.get_allocator());
225
227
 
226
228
  // In the hybrid flavor, some of these pairs actually
227
229
  // belong in the window, so we will separate them out,
@@ -240,7 +242,7 @@ void cpc_compressor<A>::uncompress_hybrid_flavor(const compressed_state<A>& sour
240
242
  pairs[next_true_pair++] = row_col; // move true pair down
241
243
  }
242
244
  }
243
- target.table = u32_table<A>::make_from_pairs(pairs.data(), next_true_pair, lg_k);
245
+ target.table = u32_table<A>::make_from_pairs(pairs.data(), next_true_pair, lg_k, pairs.get_allocator());
244
246
  }
245
247
 
246
248
  template<typename A>
@@ -264,21 +266,23 @@ void cpc_compressor<A>::compress_pinned_flavor(const cpc_sketch_alloc<A>& source
264
266
  }
265
267
 
266
268
  template<typename A>
267
- void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint32_t num_coupons) const {
269
+ void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& source, uncompressed_state<A>& target,
270
+ uint8_t lg_k, uint32_t num_coupons) const {
268
271
  if (source.window_data.size() == 0) throw std::logic_error("window is expected");
269
272
  uncompress_sliding_window(source.window_data.data(), source.window_data_words, target.window, lg_k, num_coupons);
270
273
  const size_t num_pairs = source.table_num_entries;
271
274
  if (num_pairs == 0) {
272
- target.table = u32_table<A>(2, 6 + lg_k);
275
+ target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
273
276
  } else {
274
277
  if (source.table_data.size() == 0) throw std::logic_error("table is expected");
275
- vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs, lg_k);
278
+ vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
279
+ lg_k, source.table_data.get_allocator());
276
280
  // undo the compressor's 8-column shift
277
281
  for (size_t i = 0; i < num_pairs; i++) {
278
282
  if ((pairs[i] & 63) >= 56) throw std::logic_error("(pairs[i] & 63) >= 56");
279
283
  pairs[i] += 8;
280
284
  }
281
- target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k);
285
+ target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k, pairs.get_allocator());
282
286
  }
283
287
  }
284
288
 
@@ -314,15 +318,17 @@ void cpc_compressor<A>::compress_sliding_flavor(const cpc_sketch_alloc<A>& sourc
314
318
  }
315
319
 
316
320
  template<typename A>
317
- void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint32_t num_coupons) const {
321
+ void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& source, uncompressed_state<A>& target,
322
+ uint8_t lg_k, uint32_t num_coupons) const {
318
323
  if (source.window_data.size() == 0) throw std::logic_error("window is expected");
319
324
  uncompress_sliding_window(source.window_data.data(), source.window_data_words, target.window, lg_k, num_coupons);
320
325
  const size_t num_pairs = source.table_num_entries;
321
326
  if (num_pairs == 0) {
322
- target.table = u32_table<A>(2, 6 + lg_k);
327
+ target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
323
328
  } else {
324
329
  if (source.table_data.size() == 0) throw std::logic_error("table is expected");
325
- vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs, lg_k);
330
+ vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
331
+ lg_k, source.table_data.get_allocator());
326
332
 
327
333
  const uint8_t pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
328
334
  if (pseudo_phase >= 16) throw std::logic_error("pseudo phase >= 16");
@@ -342,7 +348,7 @@ void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& sou
342
348
  pairs[i] = (row << 6) | col;
343
349
  }
344
350
 
345
- target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k);
351
+ target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k, pairs.get_allocator());
346
352
  }
347
353
  }
348
354
 
@@ -364,9 +370,10 @@ void cpc_compressor<A>::compress_surprising_values(const vector_u32<A>& pairs, u
364
370
  }
365
371
 
366
372
  template<typename A>
367
- vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs, uint8_t lg_k) const {
373
+ vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs,
374
+ uint8_t lg_k, const A& allocator) const {
368
375
  const size_t k = 1 << lg_k;
369
- vector_u32<A> pairs(num_pairs);
376
+ vector_u32<A> pairs(num_pairs, 0, allocator);
370
377
  const uint8_t num_base_bits = golomb_choose_number_of_base_bits(k + num_pairs, num_pairs);
371
378
  low_level_uncompress_pairs(pairs.data(), num_pairs, num_base_bits, data, data_words);
372
379
  return pairs;
@@ -388,7 +395,8 @@ void cpc_compressor<A>::compress_sliding_window(const uint8_t* window, uint8_t l
388
395
  }
389
396
 
390
397
  template<typename A>
391
- void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data, size_t data_words, vector_u8<A>& window, uint8_t lg_k, uint32_t num_coupons) const {
398
+ void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data, size_t data_words, vector_u8<A>& window,
399
+ uint8_t lg_k, uint32_t num_coupons) const {
392
400
  const size_t k = 1 << lg_k;
393
401
  window.resize(k); // zeroing not needed here (unlike the Hybrid Flavor)
394
402
  const uint8_t pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
@@ -710,9 +718,10 @@ void write_unary(
710
718
  // The empty space that this leaves at the beginning of the output array
711
719
  // will be filled in later by the caller.
712
720
  template<typename A>
713
- vector_u32<A> cpc_compressor<A>::tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space) {
721
+ vector_u32<A> cpc_compressor<A>::tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get,
722
+ uint32_t empty_space, const A& allocator) {
714
723
  const size_t output_length = empty_space + num_pairs_to_get;
715
- vector_u32<A> pairs(output_length);
724
+ vector_u32<A> pairs(output_length, 0, allocator);
716
725
  size_t pair_index = empty_space;
717
726
  for (unsigned row_index = 0; row_index < k; row_index++) {
718
727
  uint8_t byte = window[row_index];
@@ -49,7 +49,7 @@ template<typename A> class cpc_sketch_alloc;
49
49
  template<typename A> class cpc_union_alloc;
50
50
 
51
51
  // alias with default allocator for convenience
52
- typedef cpc_sketch_alloc<std::allocator<void>> cpc_sketch;
52
+ using cpc_sketch = cpc_sketch_alloc<std::allocator<uint8_t>>;
53
53
 
54
54
  // allocation and initialization of global decompression (decoding) tables
55
55
  // call this before anything else if you want to control the initialization time
@@ -67,7 +67,10 @@ public:
67
67
  * @param lg_k base 2 logarithm of the number of bins in the sketch
68
68
  * @param seed for hash function
69
69
  */
70
- explicit cpc_sketch_alloc(uint8_t lg_k = CPC_DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED);
70
+ explicit cpc_sketch_alloc(uint8_t lg_k = CPC_DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
71
+
72
+ using allocator_type = A;
73
+ A get_allocator() const;
71
74
 
72
75
  /**
73
76
  * @return configured lg_k of this sketch
@@ -204,7 +207,7 @@ public:
204
207
 
205
208
  // This is a convenience alias for users
206
209
  // The type returned by the following serialize method
207
- typedef vector_u8<A> vector_bytes;
210
+ using vector_bytes = vector_u8<A>;
208
211
 
209
212
  /**
210
213
  * This method serializes the sketch as a vector of bytes.
@@ -221,7 +224,7 @@ public:
221
224
  * @param seed the seed for the hash function that was used to create the sketch
222
225
  * @return an instance of a sketch
223
226
  */
224
- static cpc_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
227
+ static cpc_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
225
228
 
226
229
  /**
227
230
  * This method deserializes a sketch from a given array of bytes.
@@ -230,7 +233,7 @@ public:
230
233
  * @param seed the seed for the hash function that was used to create the sketch
231
234
  * @return an instance of the sketch
232
235
  */
233
- static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
236
+ static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
234
237
 
235
238
  // for internal use
236
239
  uint32_t get_num_coupons() const;