datasketches 0.2.2 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +12 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/README.md +8 -8
  6. data/ext/datasketches/kll_wrapper.cpp +5 -1
  7. data/ext/datasketches/theta_wrapper.cpp +20 -4
  8. data/lib/datasketches/version.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +27 -5
  10. data/vendor/datasketches-cpp/LICENSE +40 -3
  11. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  12. data/vendor/datasketches-cpp/NOTICE +1 -1
  13. data/vendor/datasketches-cpp/README.md +76 -9
  14. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +18 -13
  16. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +16 -0
  18. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
  19. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
  20. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
  21. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
  22. data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
  23. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  24. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
  25. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
  26. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
  27. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +1 -1
  28. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +5 -3
  29. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +10 -6
  31. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  32. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
  33. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
  34. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  35. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
  36. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +30 -12
  37. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
  38. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
  39. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  40. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
  41. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
  42. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
  43. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
  44. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +6 -4
  45. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
  46. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
  47. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
  48. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
  49. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
  50. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
  51. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
  52. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
  53. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
  54. data/vendor/datasketches-cpp/kll/CMakeLists.txt +5 -19
  55. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
  56. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
  57. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +103 -44
  58. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +110 -130
  59. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +156 -23
  60. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  61. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  62. data/vendor/datasketches-cpp/python/CMakeLists.txt +17 -6
  63. data/vendor/datasketches-cpp/python/README.md +57 -50
  64. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  65. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  66. data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
  67. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
  68. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +49 -14
  69. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
  70. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
  71. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
  72. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +2 -2
  73. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +12 -5
  74. data/vendor/datasketches-cpp/python/tests/kll_test.py +12 -6
  75. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
  76. data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
  77. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  78. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
  79. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
  80. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
  81. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
  82. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
  83. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
  84. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
  85. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
  86. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
  87. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
  88. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
  89. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
  90. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
  91. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
  92. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
  93. data/vendor/datasketches-cpp/req/CMakeLists.txt +6 -21
  94. data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
  95. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
  96. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
  97. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +66 -61
  98. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
  99. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  100. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +54 -12
  101. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +45 -34
  102. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
  103. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +33 -15
  104. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
  105. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
  106. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
  107. data/vendor/datasketches-cpp/setup.py +10 -7
  108. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  109. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  110. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +92 -23
  111. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
  112. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  113. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +7 -6
  114. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +3 -2
  115. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +32 -15
  116. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +150 -93
  117. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +6 -1
  118. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  119. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -2
  120. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  121. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +9 -5
  122. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +39 -10
  123. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  124. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
  125. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  126. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  127. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  128. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  129. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  130. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
  131. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +446 -0
  132. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +429 -1
  133. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -11
  134. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  135. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  136. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +3 -3
  137. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  138. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  139. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +29 -9
  140. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +34 -14
  141. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  142. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  143. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +16 -0
  144. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
  145. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
  146. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +46 -8
  147. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +8 -0
  148. metadata +34 -12
  149. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
  150. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
  151. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
  152. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
  153. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  154. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -0,0 +1,121 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef QUANTILE_SKETCH_SORTED_VIEW_HPP_
21
+ #define QUANTILE_SKETCH_SORTED_VIEW_HPP_
22
+
23
+ #include <functional>
24
+
25
+ namespace datasketches {
26
+
27
+ template<
28
+ typename T,
29
+ typename Comparator, // strict weak ordering function (see C++ named requirements: Compare)
30
+ typename Allocator
31
+ >
32
+ class quantile_sketch_sorted_view {
33
+ public:
34
+ using Entry = typename std::conditional<std::is_arithmetic<T>::value, std::pair<T, uint64_t>, std::pair<const T*, uint64_t>>::type;
35
+ using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
36
+ using Container = std::vector<Entry, AllocEntry>;
37
+
38
+ quantile_sketch_sorted_view(uint32_t num, const Allocator& allocator);
39
+
40
+ template<typename Iterator>
41
+ void add(Iterator begin, Iterator end, uint64_t weight);
42
+
43
+ template<bool inclusive>
44
+ void convert_to_cummulative();
45
+
46
+ class const_iterator;
47
+ const_iterator begin() const;
48
+ const_iterator end() const;
49
+
50
+ size_t size() const;
51
+
52
+ // makes sense only with cumulative weight
53
+ using quantile_return_type = typename std::conditional<std::is_arithmetic<T>::value, T, const T&>::type;
54
+ quantile_return_type get_quantile(double rank) const;
55
+
56
+ private:
57
+ static inline const T& deref_helper(const T* t) { return *t; }
58
+ static inline T deref_helper(T t) { return t; }
59
+
60
+ struct compare_pairs_by_first {
61
+ bool operator()(const Entry& a, const Entry& b) const {
62
+ return Comparator()(deref_helper(a.first), deref_helper(b.first));
63
+ }
64
+ };
65
+
66
+ struct compare_pairs_by_second {
67
+ bool operator()(const Entry& a, const Entry& b) const {
68
+ return a.second < b.second;
69
+ }
70
+ };
71
+
72
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
73
+ static inline T ref_helper(const T& t) { return t; }
74
+
75
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
76
+ static inline const T* ref_helper(const T& t) { return std::addressof(t); }
77
+
78
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
79
+ static inline Entry make_dummy_entry(uint64_t weight) { return Entry(0, weight); }
80
+
81
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
82
+ static inline Entry make_dummy_entry(uint64_t weight) { return Entry(nullptr, weight); }
83
+
84
+ uint64_t total_weight_;
85
+ Container entries_;
86
+ };
87
+
88
+ template<typename T, typename C, typename A>
89
+ class quantile_sketch_sorted_view<T, C, A>::const_iterator: public quantile_sketch_sorted_view<T, C, A>::Container::const_iterator {
90
+ public:
91
+ using Base = typename quantile_sketch_sorted_view<T, C, A>::Container::const_iterator;
92
+ using value_type = typename std::conditional<std::is_arithmetic<T>::value, typename Base::value_type, std::pair<const T&, const uint64_t>>::type;
93
+
94
+ const_iterator(const Base& it): Base(it) {}
95
+
96
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
97
+ value_type operator*() const { return Base::operator*(); }
98
+
99
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
100
+ value_type operator*() const { return value_type(*(Base::operator*().first), Base::operator*().second); }
101
+
102
+ class return_value_holder {
103
+ public:
104
+ return_value_holder(value_type value): value_(value) {}
105
+ const value_type* operator->() const { return &value_; }
106
+ private:
107
+ value_type value_;
108
+ };
109
+
110
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
111
+ const value_type* operator->() const { return Base::operator->(); }
112
+
113
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
114
+ return_value_holder operator->() const { return **this; }
115
+ };
116
+
117
+ } /* namespace datasketches */
118
+
119
+ #include "quantile_sketch_sorted_view_impl.hpp"
120
+
121
+ #endif
@@ -0,0 +1,91 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef QUANTILE_SKETCH_SORTED_VIEW_IMPL_HPP_
21
+ #define QUANTILE_SKETCH_SORTED_VIEW_IMPL_HPP_
22
+
23
+ #include <algorithm>
24
+ #include <stdexcept>
25
+
26
+ namespace datasketches {
27
+
28
+ template<typename T, typename C, typename A>
29
+ quantile_sketch_sorted_view<T, C, A>::quantile_sketch_sorted_view(uint32_t num, const A& allocator):
30
+ total_weight_(0),
31
+ entries_(allocator)
32
+ {
33
+ entries_.reserve(num);
34
+ }
35
+
36
+ template<typename T, typename C, typename A>
37
+ template<typename Iterator>
38
+ void quantile_sketch_sorted_view<T, C, A>::add(Iterator first, Iterator last, uint64_t weight) {
39
+ const size_t size_before = entries_.size();
40
+ for (auto it = first; it != last; ++it) entries_.push_back(Entry(ref_helper(*it), weight));
41
+ if (size_before > 0) {
42
+ Container tmp(entries_.get_allocator());
43
+ tmp.reserve(entries_.capacity());
44
+ std::merge(
45
+ entries_.begin(), entries_.begin() + size_before,
46
+ entries_.begin() + size_before, entries_.end(),
47
+ std::back_inserter(tmp), compare_pairs_by_first()
48
+ );
49
+ std::swap(tmp, entries_);
50
+ }
51
+ }
52
+
53
+ template<typename T, typename C, typename A>
54
+ template<bool inclusive>
55
+ void quantile_sketch_sorted_view<T, C, A>::convert_to_cummulative() {
56
+ uint64_t subtotal = 0;
57
+ for (auto& entry: entries_) {
58
+ const uint64_t new_subtotal = subtotal + entry.second;
59
+ entry.second = inclusive ? new_subtotal : subtotal;
60
+ subtotal = new_subtotal;
61
+ }
62
+ total_weight_ = subtotal;
63
+ }
64
+
65
+ template<typename T, typename C, typename A>
66
+ auto quantile_sketch_sorted_view<T, C, A>::get_quantile(double rank) const -> quantile_return_type {
67
+ if (total_weight_ == 0) throw std::invalid_argument("supported for cumulative weight only");
68
+ uint64_t weight = static_cast<uint64_t>(rank * total_weight_);
69
+ auto it = std::lower_bound(entries_.begin(), entries_.end(), make_dummy_entry<T>(weight), compare_pairs_by_second());
70
+ if (it == entries_.end()) return deref_helper(entries_[entries_.size() - 1].first);
71
+ return deref_helper(it->first);
72
+ }
73
+
74
+ template<typename T, typename C, typename A>
75
+ auto quantile_sketch_sorted_view<T, C, A>::begin() const -> const_iterator {
76
+ return entries_.begin();
77
+ }
78
+
79
+ template<typename T, typename C, typename A>
80
+ auto quantile_sketch_sorted_view<T, C, A>::end() const -> const_iterator {
81
+ return entries_.end();
82
+ }
83
+
84
+ template<typename T, typename C, typename A>
85
+ size_t quantile_sketch_sorted_view<T, C, A>::size() const {
86
+ return entries_.size();
87
+ }
88
+
89
+ } /* namespace datasketches */
90
+
91
+ #endif
@@ -20,7 +20,9 @@
20
20
  #ifndef CLASS_TEST_TYPE_HPP_
21
21
  #define CLASS_TEST_TYPE_HPP_
22
22
 
23
+ #include <cstring>
23
24
  #include <iostream>
25
+ #include "memory_operations.hpp"
24
26
 
25
27
  namespace datasketches {
26
28
 
@@ -32,43 +32,23 @@ target_include_directories(cpc
32
32
  target_link_libraries(cpc INTERFACE common)
33
33
  target_compile_features(cpc INTERFACE cxx_std_11)
34
34
 
35
- set(cpc_HEADERS "")
36
- list(APPEND cpc_HEADERS "include/compression_data.hpp")
37
- list(APPEND cpc_HEADERS "include/cpc_common.hpp")
38
- list(APPEND cpc_HEADERS "include/cpc_compressor.hpp")
39
- list(APPEND cpc_HEADERS "include/cpc_compressor_impl.hpp")
40
- list(APPEND cpc_HEADERS "include/cpc_confidence.hpp")
41
- list(APPEND cpc_HEADERS "include/cpc_sketch.hpp")
42
- list(APPEND cpc_HEADERS "include/cpc_sketch_impl.hpp")
43
- list(APPEND cpc_HEADERS "include/cpc_union.hpp")
44
- list(APPEND cpc_HEADERS "include/cpc_union_impl.hpp")
45
- list(APPEND cpc_HEADERS "include/cpc_util.hpp")
46
- list(APPEND cpc_HEADERS "include/icon_estimator.hpp")
47
- list(APPEND cpc_HEADERS "include/kxp_byte_lookup.hpp")
48
- list(APPEND cpc_HEADERS "include/u32_table.hpp")
49
- list(APPEND cpc_HEADERS "include/u32_table_impl.hpp")
50
-
51
35
  install(TARGETS cpc
52
36
  EXPORT ${PROJECT_NAME}
53
37
  )
54
38
 
55
- install(FILES ${cpc_HEADERS}
39
+ install(FILES
40
+ include/compression_data.hpp
41
+ include/cpc_common.hpp
42
+ include/cpc_compressor.hpp
43
+ include/cpc_compressor_impl.hpp
44
+ include/cpc_confidence.hpp
45
+ include/cpc_sketch.hpp
46
+ include/cpc_sketch_impl.hpp
47
+ include/cpc_union.hpp
48
+ include/cpc_union_impl.hpp
49
+ include/cpc_util.hpp
50
+ include/icon_estimator.hpp
51
+ include/kxp_byte_lookup.hpp
52
+ include/u32_table.hpp
53
+ include/u32_table_impl.hpp
56
54
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
57
-
58
- target_sources(cpc
59
- INTERFACE
60
- ${CMAKE_CURRENT_SOURCE_DIR}/include/compression_data.hpp
61
- ${CMAKE_CURRENT_SOURCE_DIR}/include/cpc_common.hpp
62
- ${CMAKE_CURRENT_SOURCE_DIR}/include/cpc_compressor.hpp
63
- ${CMAKE_CURRENT_SOURCE_DIR}/include/cpc_compressor_impl.hpp
64
- ${CMAKE_CURRENT_SOURCE_DIR}/include/cpc_confidence.hpp
65
- ${CMAKE_CURRENT_SOURCE_DIR}/include/cpc_sketch.hpp
66
- ${CMAKE_CURRENT_SOURCE_DIR}/include/cpc_sketch_impl.hpp
67
- ${CMAKE_CURRENT_SOURCE_DIR}/include/cpc_union.hpp
68
- ${CMAKE_CURRENT_SOURCE_DIR}/include/cpc_union_impl.hpp
69
- ${CMAKE_CURRENT_SOURCE_DIR}/include/cpc_util.hpp
70
- ${CMAKE_CURRENT_SOURCE_DIR}/include/icon_estimator.hpp
71
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kxp_byte_lookup.hpp
72
- ${CMAKE_CURRENT_SOURCE_DIR}/include/u32_table.hpp
73
- ${CMAKE_CURRENT_SOURCE_DIR}/include/u32_table_impl.hpp
74
- )
@@ -26,9 +26,16 @@
26
26
 
27
27
  namespace datasketches {
28
28
 
29
- static const uint8_t CPC_MIN_LG_K = 4;
30
- static const uint8_t CPC_MAX_LG_K = 26;
31
- static const uint8_t CPC_DEFAULT_LG_K = 11;
29
+ namespace cpc_constants {
30
+ const uint8_t MIN_LG_K = 4;
31
+ const uint8_t MAX_LG_K = 26;
32
+ const uint8_t DEFAULT_LG_K = 11;
33
+ }
34
+
35
+ // TODO: Redundant and deprecated. Will be removed in next major version release.
36
+ static const uint8_t CPC_MIN_LG_K = cpc_constants::MIN_LG_K;
37
+ static const uint8_t CPC_MAX_LG_K = cpc_constants::MAX_LG_K;
38
+ static const uint8_t CPC_DEFAULT_LG_K = cpc_constants::DEFAULT_LG_K;
32
39
 
33
40
  template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
34
41
  template<typename A> using AllocU16 = typename std::allocator_traits<A>::template rebind_alloc<uint16_t>;
@@ -23,6 +23,7 @@
23
23
  #define CPC_COMPRESSOR_IMPL_HPP_
24
24
 
25
25
  #include <memory>
26
+ #include <stdexcept>
26
27
 
27
28
  #include "compression_data.hpp"
28
29
  #include "cpc_util.hpp"
@@ -23,6 +23,7 @@
23
23
  #define CPC_CONFIDENCE_HPP_
24
24
 
25
25
  #include <cmath>
26
+ #include <stdexcept>
26
27
 
27
28
  #include "cpc_sketch.hpp"
28
29
 
@@ -67,7 +67,7 @@ public:
67
67
  * @param lg_k base 2 logarithm of the number of bins in the sketch
68
68
  * @param seed for hash function
69
69
  */
70
- explicit cpc_sketch_alloc(uint8_t lg_k = CPC_DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
70
+ explicit cpc_sketch_alloc(uint8_t lg_k = cpc_constants::DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
71
71
 
72
72
  using allocator_type = A;
73
73
  A get_allocator() const;
@@ -381,7 +381,9 @@ void cpc_sketch_alloc<A>::refresh_kxp(const uint64_t* bit_matrix) {
381
381
 
382
382
  template<typename A>
383
383
  string<A> cpc_sketch_alloc<A>::to_string() const {
384
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
384
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
385
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
386
+ std::ostringstream os;
385
387
  os << "### CPC sketch summary:" << std::endl;
386
388
  os << " lg_k : " << std::to_string(lg_k) << std::endl;
387
389
  os << " seed hash : " << std::hex << compute_seed_hash(seed) << std::dec << std::endl;
@@ -392,14 +394,14 @@ string<A> cpc_sketch_alloc<A>::to_string() const {
392
394
  os << " HIP estimate : " << hip_est_accum << std::endl;
393
395
  os << " kxp : " << kxp << std::endl;
394
396
  }
395
- os << " intresting col : " << std::to_string(first_interesting_column) << std::endl;
397
+ os << " interesting col: " << std::to_string(first_interesting_column) << std::endl;
396
398
  os << " table entries : " << surprising_value_table.get_num_items() << std::endl;
397
399
  os << " window : " << (sliding_window.size() == 0 ? "not " : "") << "allocated" << std::endl;
398
400
  if (sliding_window.size() > 0) {
399
401
  os << " window offset : " << std::to_string(window_offset) << std::endl;
400
402
  }
401
403
  os << "### End sketch summary" << std::endl;
402
- return os.str();
404
+ return string<A>(os.str().c_str(), sliding_window.get_allocator());
403
405
  }
404
406
 
405
407
  template<typename A>
@@ -45,7 +45,7 @@ public:
45
45
  * @param lg_k base 2 logarithm of the number of bins in the sketch
46
46
  * @param seed for hash function
47
47
  */
48
- explicit cpc_union_alloc(uint8_t lg_k = CPC_DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
48
+ explicit cpc_union_alloc(uint8_t lg_k = cpc_constants::DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
49
49
 
50
50
  cpc_union_alloc(const cpc_union_alloc<A>& other);
51
51
  cpc_union_alloc(cpc_union_alloc<A>&& other) noexcept;
@@ -22,6 +22,8 @@
22
22
 
23
23
  #include "count_zeros.hpp"
24
24
 
25
+ #include <stdexcept>
26
+
25
27
  namespace datasketches {
26
28
 
27
29
  template<typename A>
@@ -34,7 +36,7 @@ bit_matrix(allocator)
34
36
  if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
35
37
  throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
36
38
  }
37
- accumulator = new (AllocCpc().allocate(1)) cpc_sketch_alloc<A>(lg_k, seed, allocator);
39
+ accumulator = new (AllocCpc(allocator).allocate(1)) cpc_sketch_alloc<A>(lg_k, seed, allocator);
38
40
  }
39
41
 
40
42
  template<typename A>
@@ -45,7 +47,7 @@ accumulator(other.accumulator),
45
47
  bit_matrix(other.bit_matrix)
46
48
  {
47
49
  if (accumulator != nullptr) {
48
- accumulator = new (AllocCpc().allocate(1)) cpc_sketch_alloc<A>(*other.accumulator);
50
+ accumulator = new (AllocCpc(accumulator->get_allocator()).allocate(1)) cpc_sketch_alloc<A>(*other.accumulator);
49
51
  }
50
52
  }
51
53
 
@@ -62,8 +64,9 @@ bit_matrix(std::move(other.bit_matrix))
62
64
  template<typename A>
63
65
  cpc_union_alloc<A>::~cpc_union_alloc() {
64
66
  if (accumulator != nullptr) {
67
+ AllocCpc allocator(accumulator->get_allocator());
65
68
  accumulator->~cpc_sketch_alloc<A>();
66
- AllocCpc().deallocate(accumulator, 1);
69
+ allocator.deallocate(accumulator, 1);
67
70
  }
68
71
  }
69
72
 
@@ -181,7 +184,7 @@ template<typename A>
181
184
  cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_accumulator() const {
182
185
  if (lg_k != accumulator->get_lg_k()) throw std::logic_error("lg_k != accumulator->lg_k");
183
186
  if (accumulator->get_num_coupons() == 0) {
184
- return cpc_sketch_alloc<A>(lg_k, seed);
187
+ return cpc_sketch_alloc<A>(lg_k, seed, accumulator->get_allocator());
185
188
  }
186
189
  if (accumulator->determine_flavor() != cpc_sketch_alloc<A>::flavor::SPARSE) throw std::logic_error("wrong flavor");
187
190
  cpc_sketch_alloc<A> copy(*accumulator);
@@ -242,8 +245,9 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
242
245
  template<typename A>
243
246
  void cpc_union_alloc<A>::switch_to_bit_matrix() {
244
247
  bit_matrix = accumulator->build_bit_matrix();
248
+ AllocCpc allocator(accumulator->get_allocator());
245
249
  accumulator->~cpc_sketch_alloc<A>();
246
- AllocCpc().deallocate(accumulator, 1);
250
+ allocator.deallocate(accumulator, 1);
247
251
  accumulator = nullptr;
248
252
  }
249
253
 
@@ -324,7 +328,7 @@ void cpc_union_alloc<A>::reduce_k(uint8_t new_lg_k) {
324
328
  if (bit_matrix.size() > 0) throw std::logic_error("bit_matrix is not expected");
325
329
  if (!accumulator->is_empty()) {
326
330
  cpc_sketch_alloc<A> old_accumulator(*accumulator);
327
- *accumulator = cpc_sketch_alloc<A>(new_lg_k, seed);
331
+ *accumulator = cpc_sketch_alloc<A>(new_lg_k, seed, old_accumulator.get_allocator());
328
332
  walk_table_updating_sketch(old_accumulator.surprising_value_table);
329
333
  }
330
334
  lg_k = new_lg_k;
@@ -25,6 +25,7 @@
25
25
  #include <catch.hpp>
26
26
 
27
27
  #include "cpc_sketch.hpp"
28
+ #include "cpc_union.hpp"
28
29
  #include "test_allocator.hpp"
29
30
 
30
31
  namespace datasketches {
@@ -234,4 +235,20 @@ TEST_CASE("cpc sketch allocation: serialize deserialize sliding, bytes", "[cpc_s
234
235
  REQUIRE(test_allocator_net_allocations == 0);
235
236
  }
236
237
 
238
+ using cpc_union_test_alloc = cpc_union_alloc<test_allocator<uint8_t>>;
239
+
240
+ TEST_CASE("cpc sketch allocation: union") {
241
+ cpc_sketch_test_alloc s1(11, DEFAULT_SEED, 0);
242
+ s1.update(1);
243
+
244
+ cpc_sketch_test_alloc s2(11, DEFAULT_SEED, 0);
245
+ s2.update(2);
246
+
247
+ cpc_union_test_alloc u(11, DEFAULT_SEED, 0);
248
+ u.update(s1);
249
+ u.update(s2);
250
+ auto s3 = u.get_result();
251
+ REQUIRE_FALSE(s3.is_empty());
252
+ }
253
+
237
254
  } /* namespace datasketches */
@@ -21,6 +21,7 @@
21
21
  #include <cstring>
22
22
  #include <sstream>
23
23
  #include <fstream>
24
+ #include <stdexcept>
24
25
 
25
26
  #include <catch.hpp>
26
27
 
@@ -21,6 +21,8 @@
21
21
 
22
22
  #include "cpc_union.hpp"
23
23
 
24
+ #include <stdexcept>
25
+
24
26
  namespace datasketches {
25
27
 
26
28
  static const double RELATIVE_ERROR_FOR_LG_K_11 = 0.02;
@@ -32,23 +32,13 @@ target_include_directories(fi
32
32
  target_link_libraries(fi INTERFACE common)
33
33
  target_compile_features(fi INTERFACE cxx_std_11)
34
34
 
35
- set(fi_HEADERS "")
36
- list(APPEND fi_HEADERS "include/frequent_items_sketch.hpp")
37
- list(APPEND fi_HEADERS "include/frequent_items_sketch_impl.hpp")
38
- list(APPEND fi_HEADERS "include/reverse_purge_hash_map.hpp")
39
- list(APPEND fi_HEADERS "include/reverse_purge_hash_map_impl.hpp")
40
-
41
35
  install(TARGETS fi
42
36
  EXPORT ${PROJECT_NAME}
43
37
  )
44
38
 
45
- install(FILES ${fi_HEADERS}
39
+ install(FILES
40
+ include/frequent_items_sketch.hpp
41
+ include/frequent_items_sketch_impl.hpp
42
+ include/reverse_purge_hash_map.hpp
43
+ include/reverse_purge_hash_map_impl.hpp
46
44
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
47
-
48
- target_sources(fi
49
- INTERFACE
50
- ${CMAKE_CURRENT_SOURCE_DIR}/include/frequent_items_sketch.hpp
51
- ${CMAKE_CURRENT_SOURCE_DIR}/include/frequent_items_sketch_impl.hpp
52
- ${CMAKE_CURRENT_SOURCE_DIR}/include/reverse_purge_hash_map.hpp
53
- ${CMAKE_CURRENT_SOURCE_DIR}/include/reverse_purge_hash_map_impl.hpp
54
- )
@@ -46,7 +46,7 @@ template<
46
46
  typename W = uint64_t,
47
47
  typename H = std::hash<T>,
48
48
  typename E = std::equal_to<T>,
49
- typename S = serde<T>,
49
+ typename S = serde<T>, // deprecated, to be removed in the next major version
50
50
  typename A = std::allocator<T>
51
51
  >
52
52
  class frequent_items_sketch {
@@ -225,46 +225,78 @@ public:
225
225
  /**
226
226
  * Computes size needed to serialize the current state of the sketch.
227
227
  * This can be expensive since every item needs to be looked at.
228
+ * @param sd instance of a SerDe
228
229
  * @return size in bytes needed to serialize this sketch
229
230
  */
230
- size_t get_serialized_size_bytes() const;
231
+ template<typename SerDe = S>
232
+ size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
231
233
 
232
234
  /**
233
235
  * This method serializes the sketch into a given stream in a binary form
234
236
  * @param os output stream
237
+ * @param sd instance of a SerDe
235
238
  */
236
- void serialize(std::ostream& os) const;
239
+ template<typename SerDe = S>
240
+ void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
237
241
 
238
242
  // This is a convenience alias for users
239
243
  // The type returned by the following serialize method
240
244
  using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
241
245
 
242
-
243
246
  /**
244
247
  * This method serializes the sketch as a vector of bytes.
245
248
  * An optional header can be reserved in front of the sketch.
246
249
  * It is a blank space of a given size.
247
250
  * This header is used in Datasketches PostgreSQL extension.
248
251
  * @param header_size_bytes space to reserve in front of the sketch
252
+ * @param sd instance of a SerDe
249
253
  * @return serialized sketch as a vector of bytes
250
254
  */
251
- vector_bytes serialize(unsigned header_size_bytes = 0) const;
255
+ template<typename SerDe = S>
256
+ vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
252
257
 
253
258
  /**
254
259
  * This method deserializes a sketch from a given stream.
255
260
  * @param is input stream
261
+ * @param allocator instance of an Allocator
256
262
  * @return an instance of the sketch
263
+ *
264
+ * Deprecated, to be removed in the next major version
257
265
  */
258
266
  static frequent_items_sketch deserialize(std::istream& is, const A& allocator = A());
259
267
 
268
+ /**
269
+ * This method deserializes a sketch from a given stream.
270
+ * @param is input stream
271
+ * @param sd instance of a SerDe
272
+ * @param allocator instance of an Allocator
273
+ * @return an instance of the sketch
274
+ */
275
+ template<typename SerDe = S>
276
+ static frequent_items_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
277
+
260
278
  /**
261
279
  * This method deserializes a sketch from a given array of bytes.
262
280
  * @param bytes pointer to the array of bytes
263
281
  * @param size the size of the array
282
+ * @param allocator instance of an Allocator
264
283
  * @return an instance of the sketch
284
+ *
285
+ * Deprecated, to be removed in the next major version
265
286
  */
266
287
  static frequent_items_sketch deserialize(const void* bytes, size_t size, const A& allocator = A());
267
288
 
289
+ /**
290
+ * This method deserializes a sketch from a given array of bytes.
291
+ * @param bytes pointer to the array of bytes
292
+ * @param size the size of the array
293
+ * @param sd instance of a SerDe
294
+ * @param allocator instance of an Allocator
295
+ * @return an instance of the sketch
296
+ */
297
+ template<typename SerDe = S>
298
+ static frequent_items_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
299
+
268
300
  /**
269
301
  * Returns a human readable summary of this sketch
270
302
  * @param print_items if true include the list of items retained by the sketch