datasketches 0.1.2 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (160) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +12 -13
  4. data/ext/datasketches/ext.cpp +1 -1
  5. data/ext/datasketches/ext.h +4 -0
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/fi_wrapper.cpp +6 -8
  8. data/ext/datasketches/hll_wrapper.cpp +13 -14
  9. data/ext/datasketches/kll_wrapper.cpp +28 -76
  10. data/ext/datasketches/theta_wrapper.cpp +27 -41
  11. data/ext/datasketches/vo_wrapper.cpp +4 -6
  12. data/lib/datasketches/version.rb +1 -1
  13. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  14. data/vendor/datasketches-cpp/README.md +4 -4
  15. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
  16. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
  17. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
  18. data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
  19. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
  20. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
  21. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
  22. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
  23. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
  24. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
  25. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
  26. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
  27. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
  28. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
  29. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
  30. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
  31. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
  32. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
  33. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
  34. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
  35. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
  36. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
  37. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
  38. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  39. data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
  40. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
  41. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
  43. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
  44. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
  45. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
  46. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
  47. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
  48. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
  49. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
  50. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
  51. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
  52. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
  53. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
  54. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
  55. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
  56. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
  57. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
  58. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
  59. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
  60. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
  62. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
  63. data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
  64. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
  65. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
  66. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
  67. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
  68. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
  70. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
  71. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
  72. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
  74. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
  75. data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
  76. data/vendor/datasketches-cpp/python/README.md +6 -3
  77. data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
  78. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
  79. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
  80. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
  81. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
  82. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
  83. data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
  84. data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
  85. data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
  86. data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
  87. data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
  88. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
  89. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
  90. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
  91. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
  92. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
  93. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
  94. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
  95. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  96. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  97. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  98. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  99. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  100. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
  101. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
  102. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
  103. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
  104. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
  105. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
  106. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
  107. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
  108. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
  109. data/vendor/datasketches-cpp/setup.py +5 -3
  110. data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
  111. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
  112. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
  114. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
  115. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
  116. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
  117. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
  119. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
  120. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
  121. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
  122. data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
  123. data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
  124. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
  125. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
  126. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
  127. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
  128. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
  129. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
  130. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
  131. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
  132. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
  133. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
  134. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  135. data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
  136. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
  137. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
  138. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
  139. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
  140. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
  141. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
  142. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
  143. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
  144. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
  145. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
  146. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
  147. metadata +43 -34
  148. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
  149. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
  150. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
  151. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
  152. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
  153. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
  154. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
  155. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
  156. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  157. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
  160. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -0,0 +1,69 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef REQ_QUANTILE_CALCULATOR_HPP_
21
+ #define REQ_QUANTILE_CALCULATOR_HPP_
22
+
23
+ #include <functional>
24
+
25
+ namespace datasketches {
26
+
27
+ template<
28
+ typename T,
29
+ typename Comparator,
30
+ typename Allocator
31
+ >
32
+ class req_quantile_calculator { // helper that maps a normalized rank to a retained item via cumulative weights
33
+ public:
34
+ req_quantile_calculator(uint64_t n, const Allocator& allocator); // n is multiplied by the rank in get_quantile(); allocator backs the entry container
35
+
36
+ void add(const T* begin, const T* end, uint8_t lg_weight); // appends one (pointer, 2^lg_weight) entry per item in [begin, end) and merges it with the existing entries
37
+
38
+ template<bool inclusive>
39
+ void convert_to_cummulative(); // replaces each entry's weight by a running total (including the entry's own weight when inclusive)
40
+
41
+ const T* get_quantile(double rank) const; // first entry whose stored weight is not less than rank * n, else the last entry
42
+
43
+ private:
44
+ using Entry = std::pair<const T*, uint64_t>; // (non-owning pointer to item, weight)
45
+ using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
46
+ using Container = std::vector<Entry, AllocEntry>;
47
+
48
+ template<typename C>
49
+ struct compare_pairs_by_first_ptr { // orders entries by the pointed-to items using a default-constructed C
50
+ bool operator()(const Entry& a, const Entry& b) const { // const: the functor is stateless
51
+ return C()(*a.first, *b.first);
52
+ }
53
+ };
54
+
55
+ struct compare_pairs_by_second { // orders entries by weight only; the item pointer is ignored
56
+ bool operator()(const Entry& a, const Entry& b) const { // const: the functor is stateless
57
+ return a.second < b.second;
58
+ }
59
+ };
60
+
61
+ uint64_t n_;
62
+ Container entries_;
63
+ };
64
+
65
+ } /* namespace datasketches */
66
+
67
+ #include "req_quantile_calculator_impl.hpp"
68
+
69
+ #endif
@@ -0,0 +1,60 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef REQ_QUANTILE_CALCULATOR_IMPL_HPP_
21
+ #define REQ_QUANTILE_CALCULATOR_IMPL_HPP_
22
+
23
+ namespace datasketches {
24
+
25
+ template<typename T, typename C, typename A>
26
+ req_quantile_calculator<T, C, A>::req_quantile_calculator(uint64_t n, const A& allocator): // builds a calculator with no entries
27
+ n_(n), // kept for get_quantile(), which scales the requested rank by n_
28
+ entries_(allocator) // entry container starts empty and uses the supplied allocator
29
+ {}
30
+
31
+ template<typename T, typename C, typename A>
32
+ void req_quantile_calculator<T, C, A>::add(const T* begin, const T* end, uint8_t lg_weight) { // adds items [begin, end), each with weight 2^lg_weight
33
+ if (entries_.capacity() < entries_.size() + std::distance(begin, end)) entries_.reserve(entries_.size() + std::distance(begin, end)); // grow once up front so the push_backs below do not reallocate
34
+ const size_t size_before = entries_.size(); // boundary between the previously merged entries and the new ones
35
+ for (auto it = begin; it != end; ++it) entries_.push_back(Entry(it, static_cast<uint64_t>(1) << lg_weight)); // shift in 64 bits: a plain `1 << lg_weight` is an int shift and overflows for lg_weight >= 31
36
+ if (size_before > 0) std::inplace_merge(entries_.begin(), entries_.begin() + size_before, entries_.end(), compare_pairs_by_first_ptr<C>()); // inplace_merge requires both halves to be sorted under C already
37
+ }
38
+
39
+ template<typename T, typename C, typename A>
40
+ template<bool inclusive>
41
+ void req_quantile_calculator<T, C, A>::convert_to_cummulative() { // turns per-entry weights into running totals, in entry order
42
+ uint64_t running_total = 0; // sum of the weights of all entries visited so far
43
+ for (Entry& item: entries_) {
44
+ const uint64_t own_weight = item.second;
45
+ item.second = inclusive ? running_total + own_weight : running_total; // inclusive counts the entry's own weight, exclusive does not
46
+ running_total += own_weight;
47
+ }
48
+ }
49
+
50
+ template<typename T, typename C, typename A>
51
+ const T* req_quantile_calculator<T, C, A>::get_quantile(double rank) const { // returns a pointer to the item selected for the given normalized rank
52
+ const uint64_t target_weight = static_cast<uint64_t>(rank * n_); // rank scaled by n_, truncated toward zero
53
+ const auto pos = std::lower_bound(entries_.begin(), entries_.end(), Entry(nullptr, target_weight), compare_pairs_by_second()); // first entry whose stored weight is not less than the target
54
+ if (pos != entries_.end()) return pos->first;
55
+ return entries_.back().first; // no such entry: fall back to the last one (assumes entries_ is non-empty)
56
+ }
57
+
58
+ } /* namespace datasketches */
59
+
60
+ #endif
@@ -0,0 +1,395 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef REQ_SKETCH_HPP_
21
+ #define REQ_SKETCH_HPP_
22
+
23
+ #include "req_common.hpp"
24
+ #include "req_compactor.hpp"
25
+ #include "req_quantile_calculator.hpp"
26
+
27
+ namespace datasketches {
28
+
29
+ template<
30
+ typename T,
31
+ typename Comparator = std::less<T>,
32
+ typename SerDe = serde<T>,
33
+ typename Allocator = std::allocator<T>
34
+ >
35
+ class req_sketch {
36
+ public:
37
+ using Compactor = req_compactor<T, Comparator, Allocator>;
38
+ using AllocCompactor = typename std::allocator_traits<Allocator>::template rebind_alloc<Compactor>;
39
+ using AllocDouble = typename std::allocator_traits<Allocator>::template rebind_alloc<double>;
40
+ using vector_double = std::vector<double, AllocDouble>;
41
+
42
+ /**
43
+ * Constructor
44
+ * @param k Controls the size and error of the sketch. It must be even and in the range [4, 1024], inclusive.
45
+ * Value of 12 roughly corresponds to 1% relative error guarantee at 95% confidence.
46
+ * @param hra if true, the default, the high ranks are prioritized for better
47
+ * accuracy. Otherwise the low ranks are prioritized for better accuracy.
48
+ * @param allocator to use by this instance
49
+ */
50
+ explicit req_sketch(uint16_t k, bool hra = true, const Allocator& allocator = Allocator());
51
+
52
+ ~req_sketch();
53
+ req_sketch(const req_sketch& other);
54
+ req_sketch(req_sketch&& other) noexcept;
55
+ req_sketch& operator=(const req_sketch& other);
56
+ req_sketch& operator=(req_sketch&& other);
57
+
58
+ /**
59
+ * Returns configured parameter K
60
+ * @return parameter K
61
+ */
62
+ uint16_t get_k() const;
63
+
64
+ /**
65
+ * Returns configured parameter High Rank Accuracy
66
+ * @return parameter HRA
67
+ */
68
+ bool is_HRA() const;
69
+
70
+ /**
71
+ * Returns true if this sketch is empty.
72
+ * @return empty flag
73
+ */
74
+ bool is_empty() const;
75
+
76
+ /**
77
+ * Returns the length of the input stream.
78
+ * @return stream length
79
+ */
80
+ uint64_t get_n() const;
81
+
82
+ /**
83
+ * Returns the number of retained items in the sketch.
84
+ * @return number of retained items
85
+ */
86
+ uint32_t get_num_retained() const;
87
+
88
+ /**
89
+ * Returns true if this sketch is in estimation mode.
90
+ * @return estimation mode flag
91
+ */
92
+ bool is_estimation_mode() const;
93
+
94
+ template<typename FwdT>
95
+ void update(FwdT&& item);
96
+
97
+ template<typename FwdSk>
98
+ void merge(FwdSk&& other);
99
+
100
+ /**
101
+ * Returns the min value of the stream.
102
+ * For floating point types: if the sketch is empty this returns NaN.
103
+ * For other types: if the sketch is empty this throws runtime_error.
104
+ * @return the min value of the stream
105
+ */
106
+ const T& get_min_value() const;
107
+
108
+ /**
109
+ * Returns the max value of the stream.
110
+ * For floating point types: if the sketch is empty this returns NaN.
111
+ * For other types: if the sketch is empty this throws runtime_error.
112
+ * @return the max value of the stream
113
+ */
114
+ const T& get_max_value() const;
115
+
116
+ /**
117
+ * Returns an approximation to the normalized (fractional) rank of the given item from 0 to 1 inclusive.
118
+ * With the template parameter inclusive=true the weight of the given item is included into the rank.
119
+ * Otherwise the rank equals the sum of the weights of items less than the given item according to the Comparator.
120
+ *
121
+ * <p>If the sketch is empty this returns NaN.
122
+ *
123
+ * @param item to be ranked
124
+ * @return an approximate rank of the given item
125
+ */
126
+
127
+ template<bool inclusive = false>
128
+ double get_rank(const T& item) const;
129
+
130
+ /**
131
+ * Returns an approximation to the Probability Mass Function (PMF) of the input stream
132
+ * given a set of split points (values).
133
+ *
134
+ * <p>If the sketch is empty this returns an empty vector.
135
+ *
136
+ * @param split_points an array of <i>m</i> unique, monotonically increasing values
137
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
138
+ * The definition of an "interval" is inclusive of the left split point (or minimum value) and
139
+ * exclusive of the right split point, with the exception that the last interval will include
140
+ * the maximum value.
141
+ * It is not necessary to include either the min or max values in these split points.
142
+ *
143
+ * @return an array of m+1 doubles each of which is an approximation
144
+ * to the fraction of the input stream values (the mass) that fall into one of those intervals.
145
+ * If the template parameter inclusive=false, the definition of an "interval" is inclusive of the left split point and exclusive of the right
146
+ * split point, with the exception that the last interval will include the maximum value.
147
+ * If the template parameter inclusive=true, the definition of an "interval" is exclusive of the left split point and inclusive of the right
148
+ * split point.
149
+ */
150
+ template<bool inclusive = false>
151
+ vector_double get_PMF(const T* split_points, uint32_t size) const;
152
+
153
+ /**
154
+ * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
155
+ * cumulative analog of the PMF, of the input stream given a set of split points (values).
156
+ *
157
+ * <p>If the sketch is empty this returns an empty vector.
158
+ *
159
+ * @param split_points an array of <i>m</i> unique, monotonically increasing float values
160
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
161
+ * If the template parameter inclusive=false, the definition of an "interval" is inclusive of the left split point and exclusive of the right
162
+ * split point, with the exception that the last interval will include the maximum value.
163
+ * If the template parameter inclusive=true, the definition of an "interval" is exclusive of the left split point and inclusive of the right
164
+ * split point.
165
+ * It is not necessary to include either the min or max values in these split points.
166
+ *
167
+ * @return an array of m+1 double values, which are a consecutive approximation to the CDF
168
+ * of the input stream given the split_points. The value at array position j of the returned
169
+ * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
170
+ * array.
171
+ */
172
+ template<bool inclusive = false>
173
+ vector_double get_CDF(const T* split_points, uint32_t size) const;
174
+
175
+ /**
176
+ * Returns an approximate quantile of the given normalized rank.
177
+ * The normalized rank must be in the range [0.0, 1.0] (both inclusive).
178
+ * @param rank the given normalized rank
179
+ * @return approximate quantile given the normalized rank
180
+ */
181
+ template<bool inclusive = false>
182
+ const T& get_quantile(double rank) const;
183
+
184
+ /**
185
+ * Returns an array of quantiles that correspond to the given array of normalized ranks.
186
+ * @param ranks given array of normalized ranks.
187
+ * @return array of quantiles that correspond to the given array of normalized ranks
188
+ */
189
+ template<bool inclusive = false>
190
+ std::vector<T, Allocator> get_quantiles(const double* ranks, uint32_t size) const;
191
+
192
+ /**
193
+ * Returns an approximate lower bound of the given normalized rank.
194
+ * @param rank the given rank, a value between 0 and 1.0.
195
+ * @param num_std_dev the number of standard deviations. Must be 1, 2, or 3.
196
+ * @return an approximate lower bound rank.
197
+ */
198
+ double get_rank_lower_bound(double rank, uint8_t num_std_dev) const;
199
+
200
+ /**
201
+ * Returns an approximate upper bound of the given normalized rank.
202
+ * @param rank the given rank, a value between 0 and 1.0.
203
+ * @param num_std_dev the number of standard deviations. Must be 1, 2, or 3.
204
+ * @return an approximate upper bound rank.
205
+ */
206
+ double get_rank_upper_bound(double rank, uint8_t num_std_dev) const;
207
+
208
+ /**
209
+ * Returns an a priori estimate of relative standard error (RSE, expressed as a number in [0,1]).
210
+ * Derived from Lemma 12 in https://arxiv.org/abs/2004.01668v2, but the constant factors were
211
+ * modified based on empirical measurements.
212
+ *
213
+ * @param k the given value of k
214
+ * @param rank the given normalized rank, a number in [0,1].
215
+ * @param hra if true High Rank Accuracy mode is being selected, otherwise, Low Rank Accuracy.
216
+ * @param n an estimate of the total number of items submitted to the sketch.
217
+ * @return an a priori estimate of relative standard error (RSE, expressed as a number in [0,1]).
218
+ */
219
+ static double get_RSE(uint16_t k, double rank, bool hra, uint64_t n);
220
+
221
+ /**
222
+ * Computes size needed to serialize the current state of the sketch.
223
+ * This version is for fixed-size arithmetic types (integral and floating point).
224
+ * @return size in bytes needed to serialize this sketch
225
+ */
226
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
227
+ size_t get_serialized_size_bytes() const;
228
+
229
+ /**
230
+ * Computes size needed to serialize the current state of the sketch.
231
+ * This version is for all other types and can be expensive since every item needs to be looked at.
232
+ * @return size in bytes needed to serialize this sketch
233
+ */
234
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
235
+ size_t get_serialized_size_bytes() const;
236
+
237
+ /**
238
+ * This method serializes the sketch into a given stream in a binary form
239
+ * @param os output stream
240
+ */
241
+ void serialize(std::ostream& os) const;
242
+
243
+ // This is a convenience alias for users
244
+ // The type returned by the following serialize method
245
+ using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
246
+
247
+ /**
248
+ * This method serializes the sketch as a vector of bytes.
249
+ * An optional header can be reserved in front of the sketch.
250
+ * It is a blank space of a given size.
251
+ * This header is used in Datasketches PostgreSQL extension.
252
+ * @param header_size_bytes space to reserve in front of the sketch
253
+ */
254
+ vector_bytes serialize(unsigned header_size_bytes = 0) const;
255
+
256
+ /**
257
+ * This method deserializes a sketch from a given stream.
258
+ * @param is input stream
259
+ * @return an instance of a sketch
260
+ */
261
+ static req_sketch deserialize(std::istream& is, const Allocator& allocator = Allocator());
262
+
263
+ /**
264
+ * This method deserializes a sketch from a given array of bytes.
265
+ * @param bytes pointer to the array of bytes
266
+ * @param size the size of the array
267
+ * @return an instance of a sketch
268
+ */
269
+ static req_sketch deserialize(const void* bytes, size_t size, const Allocator& allocator = Allocator());
270
+
271
+ /**
272
+ * Prints a summary of the sketch.
273
+ * @param print_levels if true include information about levels
274
+ * @param print_items if true include sketch data
275
+ */
276
+ string<Allocator> to_string(bool print_levels = false, bool print_items = false) const;
277
+
278
+ class const_iterator;
279
+ const_iterator begin() const;
280
+ const_iterator end() const;
281
+
282
+ private:
283
+ Allocator allocator_;
284
+ uint16_t k_;
285
+ bool hra_;
286
+ uint32_t max_nom_size_;
287
+ uint32_t num_retained_;
288
+ uint64_t n_;
289
+ std::vector<Compactor, AllocCompactor> compactors_;
290
+ T* min_value_;
291
+ T* max_value_;
292
+
293
+ static const bool LAZY_COMPRESSION = false;
294
+
295
+ static const uint8_t SERIAL_VERSION = 1;
296
+ static const uint8_t FAMILY = 17;
297
+ static const size_t PREAMBLE_SIZE_BYTES = 8;
298
+ enum flags { RESERVED1, RESERVED2, IS_EMPTY, IS_HIGH_RANK, RAW_ITEMS, IS_LEVEL_ZERO_SORTED };
299
+
300
+ static constexpr double FIXED_RSE_FACTOR = 0.084;
301
+ static double relative_rse_factor();
302
+
303
+ uint8_t get_num_levels() const;
304
+ void grow();
305
+ void update_max_nom_size();
306
+ void update_num_retained();
307
+ void compress();
308
+
309
+ static double get_rank_lb(uint16_t k, uint8_t num_levels, double rank, uint8_t num_std_dev, uint64_t n, bool hra);
310
+ static double get_rank_ub(uint16_t k, uint8_t num_levels, double rank, uint8_t num_std_dev, uint64_t n, bool hra);
311
+ static bool is_exact_rank(uint16_t k, uint8_t num_levels, double rank, uint64_t n, bool hra);
312
+
313
+ using QuantileCalculator = req_quantile_calculator<T, Comparator, Allocator>;
314
+ using AllocCalc = typename std::allocator_traits<Allocator>::template rebind_alloc<QuantileCalculator>;
315
+ class calculator_deleter;
316
+ using QuantileCalculatorPtr = typename std::unique_ptr<QuantileCalculator, calculator_deleter>;
317
+ template<bool inclusive>
318
+ QuantileCalculatorPtr get_quantile_calculator() const;
319
+
320
+ // for deserialization
321
+ class item_deleter;
322
+ req_sketch(uint32_t k, bool hra, uint64_t n, std::unique_ptr<T, item_deleter> min_value, std::unique_ptr<T, item_deleter> max_value, std::vector<Compactor, AllocCompactor>&& compactors);
323
+
324
+ static void check_preamble_ints(uint8_t preamble_ints, uint8_t num_levels);
325
+ static void check_serial_version(uint8_t serial_version);
326
+ static void check_family_id(uint8_t family_id);
327
+
328
+ // implementations for floating point types
329
+ template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
330
+ static const TT& get_invalid_value() {
331
+ static TT value = std::numeric_limits<TT>::quiet_NaN();
332
+ return value;
333
+ }
334
+
335
+ template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
336
+ static inline bool check_update_value(const TT& value) {
337
+ return !std::isnan(value);
338
+ }
339
+
340
+ template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
341
+ static inline void check_split_points(const T* values, uint32_t size) {
342
+ for (uint32_t i = 0; i < size ; i++) {
343
+ if (std::isnan(values[i])) {
344
+ throw std::invalid_argument("Values must not be NaN");
345
+ }
346
+ if ((i < (size - 1)) && !(Comparator()(values[i], values[i + 1]))) {
347
+ throw std::invalid_argument("Values must be unique and monotonically increasing");
348
+ }
349
+ }
350
+ }
351
+
352
+ // implementations for all other types
353
+ template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
354
+ static const TT& get_invalid_value() {
355
+ throw std::runtime_error("getting quantiles from empty sketch is not supported for this type of values");
356
+ }
357
+
358
+ template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
359
+ static inline bool check_update_value(const TT&) {
360
+ return true;
361
+ }
362
+
363
+ template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
364
+ static inline void check_split_points(const T* values, uint32_t size) {
365
+ for (uint32_t i = 0; i < size ; i++) {
366
+ if ((i < (size - 1)) && !(Comparator()(values[i], values[i + 1]))) {
367
+ throw std::invalid_argument("Values must be unique and monotonically increasing");
368
+ }
369
+ }
370
+ }
371
+
372
+ };
373
+
374
+ template<typename T, typename C, typename S, typename A>
375
+ class req_sketch<T, C, S, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> { // input iterator over the sketch; NOTE(review): std::iterator is deprecated since C++17
376
+ public:
377
+ const_iterator& operator++(); // pre-increment
378
+ const_iterator& operator++(int); // post-increment; NOTE(review): returns a reference rather than a copy -- confirm the definition in req_sketch_impl.hpp does not return a local
379
+ bool operator==(const const_iterator& other) const;
380
+ bool operator!=(const const_iterator& other) const;
381
+ std::pair<const T&, const uint64_t> operator*() const; // presumably (item, weight) -- verify against req_sketch_impl.hpp
382
+ private:
383
+ using LevelsIterator = typename std::vector<Compactor, AllocCompactor>::const_iterator;
384
+ LevelsIterator levels_it_; // position within the vector of compactors
385
+ LevelsIterator levels_end_; // end of the vector of compactors
386
+ const T* compactor_it_; // presumably the current item inside the current compactor -- TODO confirm
387
+ friend class req_sketch<T, C, S, A>; // the constructor below is private, so only the sketch can create iterators
388
+ const_iterator(LevelsIterator begin, LevelsIterator end);
389
+ };
390
+
391
+ } /* namespace datasketches */
392
+
393
+ #include "req_sketch_impl.hpp"
394
+
395
+ #endif