datasketches 0.1.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +17 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/ext/datasketches/cpc_wrapper.cpp +12 -13
  6. data/ext/datasketches/ext.cpp +1 -1
  7. data/ext/datasketches/ext.h +4 -0
  8. data/ext/datasketches/extconf.rb +1 -1
  9. data/ext/datasketches/fi_wrapper.cpp +6 -8
  10. data/ext/datasketches/hll_wrapper.cpp +13 -14
  11. data/ext/datasketches/kll_wrapper.cpp +28 -76
  12. data/ext/datasketches/theta_wrapper.cpp +27 -41
  13. data/ext/datasketches/vo_wrapper.cpp +4 -6
  14. data/lib/datasketches/version.rb +1 -1
  15. data/vendor/datasketches-cpp/CMakeLists.txt +10 -0
  16. data/vendor/datasketches-cpp/LICENSE +40 -3
  17. data/vendor/datasketches-cpp/NOTICE +1 -1
  18. data/vendor/datasketches-cpp/README.md +4 -4
  19. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +18 -7
  20. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  21. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  24. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  25. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
  26. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  27. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
  28. data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
  29. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +13 -3
  31. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +20 -20
  32. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +116 -105
  33. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +22 -6
  34. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +140 -101
  35. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
  36. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +20 -20
  37. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -16
  38. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +6 -6
  39. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +10 -10
  40. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +21 -21
  41. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
  42. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  43. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
  44. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  45. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  46. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
  47. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +102 -105
  48. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
  49. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +141 -125
  50. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
  51. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +5 -5
  52. data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
  53. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +81 -109
  54. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +25 -24
  55. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  56. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +5 -5
  57. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +89 -105
  58. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +13 -13
  59. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +130 -165
  60. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +21 -22
  61. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  62. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
  63. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  64. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
  65. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +88 -83
  66. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  67. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +34 -45
  68. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +7 -8
  69. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +41 -52
  70. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +7 -8
  71. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +220 -251
  72. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +42 -42
  73. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +36 -38
  74. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  75. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +15 -14
  76. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +47 -44
  77. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +62 -87
  78. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +121 -128
  79. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
  80. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  81. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  82. data/vendor/datasketches-cpp/hll/include/hll.hpp +25 -53
  83. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +8 -8
  84. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +36 -36
  85. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +28 -28
  86. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  87. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +37 -37
  88. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +57 -61
  89. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  90. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  91. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  92. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  93. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  94. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  95. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +40 -25
  96. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +50 -6
  97. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +164 -136
  98. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  99. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  100. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  101. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
  102. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +178 -88
  103. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  104. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  105. data/vendor/datasketches-cpp/python/CMakeLists.txt +12 -6
  106. data/vendor/datasketches-cpp/python/README.md +52 -49
  107. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  108. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  109. data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
  110. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -6
  111. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +4 -2
  112. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
  113. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +38 -28
  114. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  115. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  116. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -2
  117. data/vendor/datasketches-cpp/python/tests/kll_test.py +5 -5
  118. data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
  119. data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
  120. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  121. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  122. data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
  123. data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +18 -8
  124. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
  125. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +488 -0
  126. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
  127. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
  128. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
  129. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
  130. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
  131. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  132. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  133. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  134. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  135. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  136. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
  137. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
  138. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +19 -13
  139. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +130 -127
  140. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
  141. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +41 -49
  142. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
  143. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
  144. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  145. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -44
  146. data/vendor/datasketches-cpp/setup.py +11 -6
  147. data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
  148. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +3 -2
  149. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
  150. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  151. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  152. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
  153. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
  154. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
  155. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +11 -4
  156. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
  157. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +26 -28
  158. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
  159. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
  160. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
  161. data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
  162. data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +24 -36
  163. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
  164. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
  165. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +163 -256
  166. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +250 -651
  167. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
  168. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
  169. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +6 -1
  170. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
  171. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +10 -21
  172. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +44 -30
  173. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  174. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  175. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  176. data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +60 -5
  177. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +74 -235
  178. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  179. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
  180. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  181. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
  182. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
  183. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +57 -70
  184. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
  185. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  186. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +18 -21
  187. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +13 -16
  188. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +7 -6
  189. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +3 -3
  190. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  191. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +13 -16
  192. metadata +51 -36
  193. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
  194. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
  195. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
  196. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
  197. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
  198. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
  199. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
  200. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
  201. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  202. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  203. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  204. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
  205. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -0,0 +1,60 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef REQ_QUANTILE_CALCULATOR_IMPL_HPP_
21
+ #define REQ_QUANTILE_CALCULATOR_IMPL_HPP_
22
+
23
+ namespace datasketches {
24
+
25
+ template<typename T, typename C, typename A>
26
+ req_quantile_calculator<T, C, A>::req_quantile_calculator(uint64_t n, const A& allocator): // n is kept for the rank-to-weight scaling in get_quantile(); entries_ is constructed with the supplied allocator
27
+ n_(n),
28
+ entries_(allocator)
29
+ {}
30
+
31
+ template<typename T, typename C, typename A>
32
+ void req_quantile_calculator<T, C, A>::add(const T* begin, const T* end, uint8_t lg_weight) { // appends a pointer to every item in [begin, end), each paired with weight 2^lg_weight
33
+ if (entries_.capacity() < entries_.size() + std::distance(begin, end)) entries_.reserve(entries_.size() + std::distance(begin, end)); // reserve room for the incoming items when the current capacity is short
34
+ const size_t size_before = entries_.size(); // boundary between the entries stored earlier and the run appended below
35
+ for (auto it = begin; it != end; ++it) entries_.push_back(Entry(it, 1 << lg_weight)); // NOTE(review): `1 << lg_weight` is evaluated as int, so lg_weight >= 31 overflows before the value reaches Entry; 1ULL would avoid that
36
+ if (size_before > 0) std::inplace_merge(entries_.begin(), entries_.begin() + size_before, entries_.end(), compare_pairs_by_first_ptr<C>()); // merges the old and new runs; presumably both are already ordered under C, as inplace_merge requires -- confirm against callers
37
+ }
38
+
39
+ template<typename T, typename C, typename A>
40
+ template<bool inclusive>
41
+ void req_quantile_calculator<T, C, A>::convert_to_cummulative() { // rewrites each entry's weight as a running total, walking entries_ in stored order
42
+ uint64_t subtotal = 0; // sum of the weights of the entries visited so far
43
+ for (auto& entry: entries_) {
44
+ const uint64_t new_subtotal = subtotal + entry.second;
45
+ entry.second = inclusive ? new_subtotal : subtotal; // inclusive: total up to and including this entry; otherwise: total of the entries before it
46
+ subtotal = new_subtotal;
47
+ }
48
+ }
49
+
50
+ template<typename T, typename C, typename A>
51
+ const T* req_quantile_calculator<T, C, A>::get_quantile(double rank) const { // returns a pointer to the stored item selected for the given normalized rank
52
+ uint64_t weight = static_cast<uint64_t>(rank * n_); // the cast truncates rank * n_ to an integer target weight
53
+ auto it = std::lower_bound(entries_.begin(), entries_.end(), Entry(nullptr, weight), compare_pairs_by_second()); // binary search on the stored weights; presumably expects convert_to_cummulative() to have run first -- confirm
54
+ if (it == entries_.end()) return entries_[entries_.size() - 1].first; // target past every entry: fall back to the last one. NOTE(review): with no entries this indexes size() - 1 on an empty container -- confirm callers rule that out
55
+ return it->first;
56
+ }
57
+
58
+ } /* namespace datasketches */
59
+
60
+ #endif
@@ -0,0 +1,395 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef REQ_SKETCH_HPP_
21
+ #define REQ_SKETCH_HPP_
22
+
23
+ #include "req_common.hpp"
24
+ #include "req_compactor.hpp"
25
+ #include "req_quantile_calculator.hpp"
26
+
27
+ namespace datasketches {
28
+
29
+ template<
30
+ typename T,
31
+ typename Comparator = std::less<T>,
32
+ typename SerDe = serde<T>,
33
+ typename Allocator = std::allocator<T>
34
+ >
35
+ class req_sketch {
36
+ public:
37
+ using Compactor = req_compactor<T, Comparator, Allocator>;
38
+ using AllocCompactor = typename std::allocator_traits<Allocator>::template rebind_alloc<Compactor>;
39
+ using AllocDouble = typename std::allocator_traits<Allocator>::template rebind_alloc<double>;
40
+ using vector_double = std::vector<double, AllocDouble>;
41
+
42
+ /**
43
+ * Constructor
44
+ * @param k Controls the size and error of the sketch. It must be even and in the range [4, 1024], inclusive.
45
+ * Value of 12 roughly corresponds to 1% relative error guarantee at 95% confidence.
46
+ * @param hra if true, the default, the high ranks are prioritized for better
47
+ * accuracy. Otherwise the low ranks are prioritized for better accuracy.
48
+ * @param allocator to use by this instance
49
+ */
50
+ explicit req_sketch(uint16_t k, bool hra = true, const Allocator& allocator = Allocator());
51
+
52
+ ~req_sketch(); // destructor, copy/move constructors and copy/move assignment are all user-declared; definitions are not in this header
53
+ req_sketch(const req_sketch& other);
54
+ req_sketch(req_sketch&& other) noexcept;
55
+ req_sketch& operator=(const req_sketch& other);
56
+ req_sketch& operator=(req_sketch&& other); // NOTE(review): unlike the move constructor above, move assignment is not declared noexcept
57
+
58
+ /**
59
+ * Returns configured parameter K
60
+ * @return parameter K
61
+ */
62
+ uint16_t get_k() const;
63
+
64
+ /**
65
+ * Returns configured parameter High Rank Accuracy
66
+ * @return parameter HRA
67
+ */
68
+ bool is_HRA() const;
69
+
70
+ /**
71
+ * Returns true if this sketch is empty.
72
+ * @return empty flag
73
+ */
74
+ bool is_empty() const;
75
+
76
+ /**
77
+ * Returns the length of the input stream.
78
+ * @return stream length
79
+ */
80
+ uint64_t get_n() const;
81
+
82
+ /**
83
+ * Returns the number of retained items in the sketch.
84
+ * @return number of retained items
85
+ */
86
+ uint32_t get_num_retained() const;
87
+
88
+ /**
89
+ * Returns true if this sketch is in estimation mode.
90
+ * @return estimation mode flag
91
+ */
92
+ bool is_estimation_mode() const;
93
+
94
+ template<typename FwdT>
95
+ void update(FwdT&& item);
96
+
97
+ template<typename FwdSk>
98
+ void merge(FwdSk&& other);
99
+
100
+ /**
101
+ * Returns the min value of the stream.
102
+ * For floating point types: if the sketch is empty this returns NaN.
103
+ * For other types: if the sketch is empty this throws runtime_error.
104
+ * @return the min value of the stream
105
+ */
106
+ const T& get_min_value() const;
107
+
108
+ /**
109
+ * Returns the max value of the stream.
110
+ * For floating point types: if the sketch is empty this returns NaN.
111
+ * For other types: if the sketch is empty this throws runtime_error.
112
+ * @return the max value of the stream
113
+ */
114
+ const T& get_max_value() const;
115
+
116
+ /**
117
+ * Returns an approximation to the normalized (fractional) rank of the given item from 0 to 1 inclusive.
118
+ * With the template parameter inclusive=true the weight of the given item is included into the rank.
119
+ * Otherwise the rank equals the sum of the weights of items less than the given item according to the Comparator.
120
+ *
121
+ * <p>If the sketch is empty this returns NaN.
122
+ *
123
+ * @param item to be ranked
124
+ * @return an approximate rank of the given item
125
+ */
126
+
127
+ template<bool inclusive = false>
128
+ double get_rank(const T& item) const;
129
+
130
+ /**
131
+ * Returns an approximation to the Probability Mass Function (PMF) of the input stream
132
+ * given a set of split points (values).
133
+ *
134
+ * <p>If the sketch is empty this returns an empty vector.
135
+ *
136
+ * @param split_points an array of <i>m</i> unique, monotonically increasing values
137
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
138
+ * The definition of an "interval" is inclusive of the left split point (or minimum value) and
139
+ * exclusive of the right split point, with the exception that the last interval will include
140
+ * the maximum value.
141
+ * It is not necessary to include either the min or max values in these split points.
142
+ *
143
+ * @return an array of m+1 doubles each of which is an approximation
144
+ * to the fraction of the input stream values (the mass) that fall into one of those intervals.
145
+ * If the template parameter inclusive=false, the definition of an "interval" is inclusive of the left split point and exclusive of the right
146
+ * split point, with the exception that the last interval will include the maximum value.
147
+ * If the template parameter inclusive=true, the definition of an "interval" is exclusive of the left split point and inclusive of the right
148
+ * split point.
149
+ */
150
+ template<bool inclusive = false>
151
+ vector_double get_PMF(const T* split_points, uint32_t size) const;
152
+
153
+ /**
154
+ * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
155
+ * cumulative analog of the PMF, of the input stream given a set of split points (values).
156
+ *
157
+ * <p>If the sketch is empty this returns an empty vector.
158
+ *
159
+ * @param split_points an array of <i>m</i> unique, monotonically increasing float values
160
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
161
+ * If the template parameter inclusive=false, the definition of an "interval" is inclusive of the left split point and exclusive of the right
162
+ * split point, with the exception that the last interval will include the maximum value.
163
+ * If the template parameter inclusive=true, the definition of an "interval" is exclusive of the left split point and inclusive of the right
164
+ * split point.
165
+ * It is not necessary to include either the min or max values in these split points.
166
+ *
167
+ * @return an array of m+1 double values, which are a consecutive approximation to the CDF
168
+ * of the input stream given the split_points. The value at array position j of the returned
169
+ * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
170
+ * array.
171
+ */
172
+ template<bool inclusive = false>
173
+ vector_double get_CDF(const T* split_points, uint32_t size) const;
174
+
175
+ /**
176
+ * Returns an approximate quantile of the given normalized rank.
177
+ * The normalized rank must be in the range [0.0, 1.0] (both inclusive).
178
+ * @param rank the given normalized rank
179
+ * @return approximate quantile given the normalized rank
180
+ */
181
+ template<bool inclusive = false>
182
+ const T& get_quantile(double rank) const;
183
+
184
+ /**
185
+ * Returns an array of quantiles that correspond to the given array of normalized ranks.
186
+ * @param ranks given array of normalized ranks.
187
+ * @return array of quantiles that correspond to the given array of normalized ranks
188
+ */
189
+ template<bool inclusive = false>
190
+ std::vector<T, Allocator> get_quantiles(const double* ranks, uint32_t size) const;
191
+
192
+ /**
193
+ * Returns an approximate lower bound of the given normalized rank.
194
+ * @param rank the given rank, a value between 0 and 1.0.
195
+ * @param num_std_dev the number of standard deviations. Must be 1, 2, or 3.
196
+ * @return an approximate lower bound rank.
197
+ */
198
+ double get_rank_lower_bound(double rank, uint8_t num_std_dev) const;
199
+
200
+ /**
201
+ * Returns an approximate upper bound of the given normalized rank.
202
+ * @param rank the given rank, a value between 0 and 1.0.
203
+ * @param num_std_dev the number of standard deviations. Must be 1, 2, or 3.
204
+ * @return an approximate upper bound rank.
205
+ */
206
+ double get_rank_upper_bound(double rank, uint8_t num_std_dev) const;
207
+
208
+ /**
209
+ * Returns an a priori estimate of relative standard error (RSE, expressed as a number in [0,1]).
210
+ * Derived from Lemma 12 in https://arxiv.org/abs/2004.01668v2, but the constant factors were
211
+ * modified based on empirical measurements.
212
+ *
213
+ * @param k the given value of k
214
+ * @param rank the given normalized rank, a number in [0,1].
215
+ * @param hra if true High Rank Accuracy mode is being selected, otherwise, Low Rank Accuracy.
216
+ * @param n an estimate of the total number of items submitted to the sketch.
217
+ * @return an a priori estimate of relative standard error (RSE, expressed as a number in [0,1]).
218
+ */
219
+ static double get_RSE(uint16_t k, double rank, bool hra, uint64_t n);
220
+
221
+ /**
222
+ * Computes size needed to serialize the current state of the sketch.
223
+ * This version is for fixed-size arithmetic types (integral and floating point).
224
+ * @return size in bytes needed to serialize this sketch
225
+ */
226
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
227
+ size_t get_serialized_size_bytes() const;
228
+
229
+ /**
230
+ * Computes size needed to serialize the current state of the sketch.
231
+ * This version is for all other types and can be expensive since every item needs to be looked at.
232
+ * @return size in bytes needed to serialize this sketch
233
+ */
234
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
235
+ size_t get_serialized_size_bytes() const;
236
+
237
+ /**
238
+ * This method serializes the sketch into a given stream in a binary form
239
+ * @param os output stream
240
+ */
241
+ void serialize(std::ostream& os) const;
242
+
243
+ // This is a convenience alias for users
244
+ // The type returned by the following serialize method
245
+ using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
246
+
247
+ /**
248
+ * This method serializes the sketch as a vector of bytes.
249
+ * An optional header can be reserved in front of the sketch.
250
+ * It is a blank space of a given size.
251
+ * This header is used in Datasketches PostgreSQL extension.
252
+ * @param header_size_bytes space to reserve in front of the sketch
253
+ */
254
+ vector_bytes serialize(unsigned header_size_bytes = 0) const;
255
+
256
+ /**
257
+ * This method deserializes a sketch from a given stream.
258
+ * @param is input stream
259
+ * @return an instance of a sketch
260
+ */
261
+ static req_sketch deserialize(std::istream& is, const Allocator& allocator = Allocator());
262
+
263
+ /**
264
+ * This method deserializes a sketch from a given array of bytes.
265
+ * @param bytes pointer to the array of bytes
266
+ * @param size the size of the array
267
+ * @return an instance of a sketch
268
+ */
269
+ static req_sketch deserialize(const void* bytes, size_t size, const Allocator& allocator = Allocator());
270
+
271
+ /**
272
+ * Prints a summary of the sketch.
273
+ * @param print_levels if true include information about levels
274
+ * @param print_items if true include sketch data
275
+ */
276
+ string<Allocator> to_string(bool print_levels = false, bool print_items = false) const;
277
+
278
+ class const_iterator;
279
+ const_iterator begin() const;
280
+ const_iterator end() const;
281
+
282
+ private:
283
+ Allocator allocator_;
284
+ uint16_t k_;
285
+ bool hra_;
286
+ uint32_t max_nom_size_;
287
+ uint32_t num_retained_;
288
+ uint64_t n_;
289
+ std::vector<Compactor, AllocCompactor> compactors_;
290
+ T* min_value_;
291
+ T* max_value_;
292
+
293
+ static const bool LAZY_COMPRESSION = false;
294
+
295
+ static const uint8_t SERIAL_VERSION = 1;
296
+ static const uint8_t FAMILY = 17;
297
+ static const size_t PREAMBLE_SIZE_BYTES = 8;
298
+ enum flags { RESERVED1, RESERVED2, IS_EMPTY, IS_HIGH_RANK, RAW_ITEMS, IS_LEVEL_ZERO_SORTED };
299
+
300
+ static constexpr double FIXED_RSE_FACTOR = 0.084;
301
+ static double relative_rse_factor();
302
+
303
+ uint8_t get_num_levels() const;
304
+ void grow();
305
+ void update_max_nom_size();
306
+ void update_num_retained();
307
+ void compress();
308
+
309
+ static double get_rank_lb(uint16_t k, uint8_t num_levels, double rank, uint8_t num_std_dev, uint64_t n, bool hra);
310
+ static double get_rank_ub(uint16_t k, uint8_t num_levels, double rank, uint8_t num_std_dev, uint64_t n, bool hra);
311
+ static bool is_exact_rank(uint16_t k, uint8_t num_levels, double rank, uint64_t n, bool hra);
312
+
313
+ using QuantileCalculator = req_quantile_calculator<T, Comparator, Allocator>;
314
+ using AllocCalc = typename std::allocator_traits<Allocator>::template rebind_alloc<QuantileCalculator>;
315
+ class calculator_deleter;
316
+ using QuantileCalculatorPtr = typename std::unique_ptr<QuantileCalculator, calculator_deleter>;
317
+ template<bool inclusive>
318
+ QuantileCalculatorPtr get_quantile_calculator() const;
319
+
320
+ // for deserialization
321
+ class item_deleter;
322
+ req_sketch(uint16_t k, bool hra, uint64_t n, std::unique_ptr<T, item_deleter> min_value, std::unique_ptr<T, item_deleter> max_value, std::vector<Compactor, AllocCompactor>&& compactors);
323
+
324
+ static void check_preamble_ints(uint8_t preamble_ints, uint8_t num_levels);
325
+ static void check_serial_version(uint8_t serial_version);
326
+ static void check_family_id(uint8_t family_id);
327
+
328
+ // implementations for floating point types
329
+ template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0> // overload enabled only when T is a floating-point type
330
+ static const TT& get_invalid_value() { // returns a reference to a quiet NaN
331
+ static TT value = std::numeric_limits<TT>::quiet_NaN(); // function-local static, so the returned reference stays valid after the call
332
+ return value;
333
+ }
334
+
335
+ template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0> // overload enabled only when T is a floating-point type
336
+ static inline bool check_update_value(const TT& value) { // true for any value that is not NaN
337
+ return !std::isnan(value);
338
+ }
339
+
340
+ template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0> // overload enabled only when T is a floating-point type
341
+ static inline void check_split_points(const T* values, uint32_t size) { // throws std::invalid_argument if any value is NaN or the sequence is not strictly increasing under Comparator
342
+ for (uint32_t i = 0; i < size ; i++) { // never entered when size == 0, so the unsigned `size - 1` below is not evaluated in that case
343
+ if (std::isnan(values[i])) {
344
+ throw std::invalid_argument("Values must not be NaN");
345
+ }
346
+ if ((i < (size - 1)) && !(Comparator()(values[i], values[i + 1]))) { // the index guard keeps values[i + 1] in range on the last element
347
+ throw std::invalid_argument("Values must be unique and monotonically increasing");
348
+ }
349
+ }
350
+ }
351
+
352
+ // implementations for all other types
353
+ template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0> // overload enabled for every non-floating-point T
354
+ static const TT& get_invalid_value() { // never returns: there is no NaN-like placeholder for these types
355
+ throw std::runtime_error("getting quantiles from empty sketch is not supported for this type of values");
356
+ }
357
+
358
+ template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0> // overload enabled for every non-floating-point T
359
+ static inline bool check_update_value(const TT&) { // the argument is ignored: every value is accepted
360
+ return true;
361
+ }
362
+
363
+ template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0> // overload enabled for every non-floating-point T
364
+ static inline void check_split_points(const T* values, uint32_t size) { // throws std::invalid_argument unless the sequence is strictly increasing under Comparator
365
+ for (uint32_t i = 0; i < size ; i++) { // never entered when size == 0, so the unsigned `size - 1` below is not evaluated in that case
366
+ if ((i < (size - 1)) && !(Comparator()(values[i], values[i + 1]))) { // the index guard keeps values[i + 1] in range on the last element
367
+ throw std::invalid_argument("Values must be unique and monotonically increasing");
368
+ }
369
+ }
370
+ }
371
+
372
+ };
373
+
374
+ template<typename T, typename C, typename S, typename A>
375
+ class req_sketch<T, C, S, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> { // input iterator over the sketch's compactor levels. NOTE(review): std::iterator is deprecated as of C++17
376
+ public:
377
+ const_iterator& operator++();
378
+ const_iterator& operator++(int); // NOTE(review): post-increment is declared to return a reference rather than a copy of the previous state -- confirm against the implementation
379
+ bool operator==(const const_iterator& other) const;
380
+ bool operator!=(const const_iterator& other) const;
381
+ std::pair<const T&, const uint64_t> operator*() const; // item reference paired with a uint64_t; presumably the item's weight -- confirm in the implementation
382
+ private:
383
+ using LevelsIterator = typename std::vector<Compactor, AllocCompactor>::const_iterator;
384
+ LevelsIterator levels_it_; // position in the vector of compactors
385
+ LevelsIterator levels_end_; // end of the vector of compactors
386
+ const T* compactor_it_; // presumably the position inside the current compactor's items -- confirm in the implementation
387
+ friend class req_sketch<T, C, S, A>; // the constructor below is private, so this friendship is what lets the sketch create iterators
388
+ const_iterator(LevelsIterator begin, LevelsIterator end);
389
+ };
390
+
391
+ } /* namespace datasketches */
392
+
393
+ #include "req_sketch_impl.hpp"
394
+
395
+ #endif