datasketches 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +12 -13
  4. data/ext/datasketches/ext.cpp +1 -1
  5. data/ext/datasketches/ext.h +4 -0
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/fi_wrapper.cpp +6 -8
  8. data/ext/datasketches/hll_wrapper.cpp +13 -14
  9. data/ext/datasketches/kll_wrapper.cpp +28 -76
  10. data/ext/datasketches/theta_wrapper.cpp +27 -41
  11. data/ext/datasketches/vo_wrapper.cpp +4 -6
  12. data/lib/datasketches/version.rb +1 -1
  13. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  14. data/vendor/datasketches-cpp/README.md +4 -4
  15. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
  16. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
  17. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
  18. data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
  19. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
  20. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
  21. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
  22. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
  23. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
  24. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
  25. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
  26. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
  27. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
  28. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
  29. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
  30. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
  31. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
  32. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
  33. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
  34. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
  35. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
  36. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
  37. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
  38. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  39. data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
  40. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
  41. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
  43. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
  44. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
  45. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
  46. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
  47. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
  48. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
  49. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
  50. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
  51. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
  52. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
  53. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
  54. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
  55. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
  56. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
  57. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
  58. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
  59. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
  60. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
  62. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
  63. data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
  64. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
  65. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
  66. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
  67. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
  68. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
  70. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
  71. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
  72. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
  74. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
  75. data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
  76. data/vendor/datasketches-cpp/python/README.md +6 -3
  77. data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
  78. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
  79. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
  80. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
  81. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
  82. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
  83. data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
  84. data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
  85. data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
  86. data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
  87. data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
  88. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
  89. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
  90. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
  91. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
  92. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
  93. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
  94. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
  95. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  96. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  97. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  98. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  99. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  100. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
  101. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
  102. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
  103. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
  104. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
  105. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
  106. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
  107. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
  108. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
  109. data/vendor/datasketches-cpp/setup.py +5 -3
  110. data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
  111. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
  112. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
  114. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
  115. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
  116. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
  117. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
  119. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
  120. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
  121. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
  122. data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
  123. data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
  124. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
  125. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
  126. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
  127. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
  128. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
  129. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
  130. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
  131. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
  132. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
  133. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
  134. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  135. data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
  136. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
  137. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
  138. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
  139. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
  140. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
  141. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
  142. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
  143. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
  144. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
  145. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
  146. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
  147. metadata +43 -34
  148. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
  149. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
  150. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
  151. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
  152. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
  153. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
  154. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
  155. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
  156. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  157. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
  160. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -1,53 +0,0 @@
1
- /*
2
- * Licensed to the Apache Software Foundation (ASF) under one
3
- * or more contributor license agreements. See the NOTICE file
4
- * distributed with this work for additional information
5
- * regarding copyright ownership. The ASF licenses this file
6
- * to you under the Apache License, Version 2.0 (the
7
- * "License"); you may not use this file except in compliance
8
- * with the License. You may obtain a copy of the License at
9
- *
10
- * http://www.apache.org/licenses/LICENSE-2.0
11
- *
12
- * Unless required by applicable law or agreed to in writing,
13
- * software distributed under the License is distributed on an
14
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
- * KIND, either express or implied. See the License for the
16
- * specific language governing permissions and limitations
17
- * under the License.
18
- */
19
-
20
- #ifndef THETA_A_NOT_B_EXPERIMENTAL_HPP_
21
- #define THETA_A_NOT_B_EXPERIMENTAL_HPP_
22
-
23
- #include "theta_sketch_experimental.hpp"
24
- #include "theta_set_difference_base.hpp"
25
-
26
- namespace datasketches {
27
-
28
- template<typename Allocator = std::allocator<uint64_t>>
29
- class theta_a_not_b_experimental {
30
- public:
31
- using Entry = uint64_t;
32
- using ExtractKey = trivial_extract_key;
33
- using CompactSketch = compact_theta_sketch_experimental<Allocator>;
34
- using State = theta_set_difference_base<Entry, ExtractKey, CompactSketch, Allocator>;
35
-
36
- explicit theta_a_not_b_experimental(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
37
-
38
- /**
39
- * Computes the a-not-b set operation given two sketches.
40
- * @return the result of a-not-b
41
- */
42
- template<typename FwdSketch, typename Sketch>
43
- CompactSketch compute(FwdSketch&& a, const Sketch& b, bool ordered = true) const;
44
-
45
- private:
46
- State state_;
47
- };
48
-
49
- } /* namespace datasketches */
50
-
51
- #include "theta_a_not_b_experimental_impl.hpp"
52
-
53
- #endif
@@ -1,78 +0,0 @@
1
- /*
2
- * Licensed to the Apache Software Foundation (ASF) under one
3
- * or more contributor license agreements. See the NOTICE file
4
- * distributed with this work for additional information
5
- * regarding copyright ownership. The ASF licenses this file
6
- * to you under the Apache License, Version 2.0 (the
7
- * "License"); you may not use this file except in compliance
8
- * with the License. You may obtain a copy of the License at
9
- *
10
- * http://www.apache.org/licenses/LICENSE-2.0
11
- *
12
- * Unless required by applicable law or agreed to in writing,
13
- * software distributed under the License is distributed on an
14
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
- * KIND, either express or implied. See the License for the
16
- * specific language governing permissions and limitations
17
- * under the License.
18
- */
19
-
20
- #ifndef THETA_INTERSECTION_EXPERIMENTAL_HPP_
21
- #define THETA_INTERSECTION_EXPERIMENTAL_HPP_
22
-
23
- #include "theta_sketch_experimental.hpp"
24
- #include "theta_intersection_base.hpp"
25
-
26
- namespace datasketches {
27
-
28
- template<typename Allocator = std::allocator<uint64_t>>
29
- class theta_intersection_experimental {
30
- public:
31
- using Entry = uint64_t;
32
- using ExtractKey = trivial_extract_key;
33
- using Sketch = theta_sketch_experimental<Allocator>;
34
- using CompactSketch = compact_theta_sketch_experimental<Allocator>;
35
-
36
- struct pass_through_policy {
37
- uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
38
- unused(incoming_entry);
39
- return internal_entry;
40
- }
41
- };
42
- using State = theta_intersection_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
43
-
44
- explicit theta_intersection_experimental(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
45
-
46
- /**
47
- * Updates the intersection with a given sketch.
48
- * The intersection can be viewed as starting from the "universe" set, and every update
49
- * can reduce the current set to leave the overlapping subset only.
50
- * @param sketch represents input set for the intersection
51
- */
52
- template<typename FwdSketch>
53
- void update(FwdSketch&& sketch);
54
-
55
- /**
56
- * Produces a copy of the current state of the intersection.
57
- * If update() was not called, the state is the infinite "universe",
58
- * which is considered an undefined state, and throws an exception.
59
- * @param ordered optional flag to specify if ordered sketch should be produced
60
- * @return the result of the intersection
61
- */
62
- CompactSketch get_result(bool ordered = true) const;
63
-
64
- /**
65
- * Returns true if the state of the intersection is defined (not infinite "universe").
66
- * @return true if the state is valid
67
- */
68
- bool has_result() const;
69
-
70
- private:
71
- State state_;
72
- };
73
-
74
- } /* namespace datasketches */
75
-
76
- #include "theta_intersection_experimental_impl.hpp"
77
-
78
- #endif
@@ -1,43 +0,0 @@
1
- /*
2
- * Licensed to the Apache Software Foundation (ASF) under one
3
- * or more contributor license agreements. See the NOTICE file
4
- * distributed with this work for additional information
5
- * regarding copyright ownership. The ASF licenses this file
6
- * to you under the Apache License, Version 2.0 (the
7
- * "License"); you may not use this file except in compliance
8
- * with the License. You may obtain a copy of the License at
9
- *
10
- * http://www.apache.org/licenses/LICENSE-2.0
11
- *
12
- * Unless required by applicable law or agreed to in writing,
13
- * software distributed under the License is distributed on an
14
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
- * KIND, either express or implied. See the License for the
16
- * specific language governing permissions and limitations
17
- * under the License.
18
- */
19
-
20
- namespace datasketches {
21
-
22
- template<typename A>
23
- theta_intersection_experimental<A>::theta_intersection_experimental(uint64_t seed, const A& allocator):
24
- state_(seed, pass_through_policy(), allocator)
25
- {}
26
-
27
- template<typename A>
28
- template<typename SS>
29
- void theta_intersection_experimental<A>::update(SS&& sketch) {
30
- state_.update(std::forward<SS>(sketch));
31
- }
32
-
33
- template<typename A>
34
- auto theta_intersection_experimental<A>::get_result(bool ordered) const -> CompactSketch {
35
- return state_.get_result(ordered);
36
- }
37
-
38
- template<typename A>
39
- bool theta_intersection_experimental<A>::has_result() const {
40
- return state_.has_result();
41
- }
42
-
43
- } /* namespace datasketches */
@@ -1,393 +0,0 @@
1
- /*
2
- * Licensed to the Apache Software Foundation (ASF) under one
3
- * or more contributor license agreements. See the NOTICE file
4
- * distributed with this work for additional information
5
- * regarding copyright ownership. The ASF licenses this file
6
- * to you under the Apache License, Version 2.0 (the
7
- * "License"); you may not use this file except in compliance
8
- * with the License. You may obtain a copy of the License at
9
- *
10
- * http://www.apache.org/licenses/LICENSE-2.0
11
- *
12
- * Unless required by applicable law or agreed to in writing,
13
- * software distributed under the License is distributed on an
14
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
- * KIND, either express or implied. See the License for the
16
- * specific language governing permissions and limitations
17
- * under the License.
18
- */
19
-
20
- #ifndef THETA_SKETCH_EXPERIMENTAL_HPP_
21
- #define THETA_SKETCH_EXPERIMENTAL_HPP_
22
-
23
- #include "theta_update_sketch_base.hpp"
24
-
25
- namespace datasketches {
26
-
27
- // experimental theta sketch derived from the same base as tuple sketch
28
-
29
- template<typename Allocator = std::allocator<uint64_t>>
30
- class theta_sketch_experimental {
31
- public:
32
- using Entry = uint64_t;
33
- using ExtractKey = trivial_extract_key;
34
- using iterator = theta_iterator<Entry, ExtractKey>;
35
- using const_iterator = theta_const_iterator<Entry, ExtractKey>;
36
-
37
- virtual ~theta_sketch_experimental() = default;
38
-
39
- /**
40
- * @return allocator
41
- */
42
- virtual Allocator get_allocator() const = 0;
43
-
44
- /**
45
- * @return true if this sketch represents an empty set (not the same as no retained entries!)
46
- */
47
- virtual bool is_empty() const = 0;
48
-
49
- /**
50
- * @return estimate of the distinct count of the input stream
51
- */
52
- double get_estimate() const;
53
-
54
- /**
55
- * Returns the approximate lower error bound given a number of standard deviations.
56
- * This parameter is similar to the number of standard deviations of the normal distribution
57
- * and corresponds to approximately 67%, 95% and 99% confidence intervals.
58
- * @param num_std_devs number of Standard Deviations (1, 2 or 3)
59
- * @return the lower bound
60
- */
61
- double get_lower_bound(uint8_t num_std_devs) const;
62
-
63
- /**
64
- * Returns the approximate upper error bound given a number of standard deviations.
65
- * This parameter is similar to the number of standard deviations of the normal distribution
66
- * and corresponds to approximately 67%, 95% and 99% confidence intervals.
67
- * @param num_std_devs number of Standard Deviations (1, 2 or 3)
68
- * @return the upper bound
69
- */
70
- double get_upper_bound(uint8_t num_std_devs) const;
71
-
72
- /**
73
- * @return true if the sketch is in estimation mode (as opposed to exact mode)
74
- */
75
- bool is_estimation_mode() const;
76
-
77
- /**
78
- * @return theta as a fraction from 0 to 1 (effective sampling rate)
79
- */
80
- double get_theta() const;
81
-
82
- /**
83
- * @return theta as a positive integer between 0 and LLONG_MAX
84
- */
85
- virtual uint64_t get_theta64() const = 0;
86
-
87
- /**
88
- * @return the number of retained entries in the sketch
89
- */
90
- virtual uint32_t get_num_retained() const = 0;
91
-
92
- /**
93
- * @return hash of the seed that was used to hash the input
94
- */
95
- virtual uint16_t get_seed_hash() const = 0;
96
-
97
- /**
98
- * @return true if retained entries are ordered
99
- */
100
- virtual bool is_ordered() const = 0;
101
-
102
- /**
103
- * Provides a human-readable summary of this sketch as a string
104
- * @param print_items if true include the list of items retained by the sketch
105
- * @return sketch summary as a string
106
- */
107
- virtual string<Allocator> to_string(bool print_items = false) const;
108
-
109
- /**
110
- * Iterator over hash values in this sketch.
111
- * @return begin iterator
112
- */
113
- virtual iterator begin() = 0;
114
-
115
- /**
116
- * Iterator pointing past the valid range.
117
- * Not to be incremented or dereferenced.
118
- * @return end iterator
119
- */
120
- virtual iterator end() = 0;
121
-
122
- /**
123
- * Const iterator over hash values in this sketch.
124
- * @return begin iterator
125
- */
126
- virtual const_iterator begin() const = 0;
127
-
128
- /**
129
- * Const iterator pointing past the valid range.
130
- * Not to be incremented or dereferenced.
131
- * @return end iterator
132
- */
133
- virtual const_iterator end() const = 0;
134
-
135
- protected:
136
- virtual void print_specifics(std::ostringstream& os) const = 0;
137
- };
138
-
139
- // forward declaration
140
- template<typename A> class compact_theta_sketch_experimental;
141
-
142
- template<typename Allocator = std::allocator<uint64_t>>
143
- class update_theta_sketch_experimental: public theta_sketch_experimental<Allocator> {
144
- public:
145
- using Base = theta_sketch_experimental<Allocator>;
146
- using Entry = typename Base::Entry;
147
- using ExtractKey = typename Base::ExtractKey;
148
- using iterator = typename Base::iterator;
149
- using const_iterator = typename Base::const_iterator;
150
- using theta_table = theta_update_sketch_base<Entry, ExtractKey, Allocator>;
151
- using resize_factor = typename theta_table::resize_factor;
152
-
153
- // No constructor here. Use builder instead.
154
- class builder;
155
-
156
- update_theta_sketch_experimental(const update_theta_sketch_experimental&) = default;
157
- update_theta_sketch_experimental(update_theta_sketch_experimental&&) noexcept = default;
158
- virtual ~update_theta_sketch_experimental() = default;
159
- update_theta_sketch_experimental& operator=(const update_theta_sketch_experimental&) = default;
160
- update_theta_sketch_experimental& operator=(update_theta_sketch_experimental&&) = default;
161
-
162
- virtual Allocator get_allocator() const;
163
- virtual bool is_empty() const;
164
- virtual bool is_ordered() const;
165
- virtual uint16_t get_seed_hash() const;
166
- virtual uint64_t get_theta64() const;
167
- virtual uint32_t get_num_retained() const;
168
-
169
- /**
170
- * @return configured nominal number of entries in the sketch
171
- */
172
- uint8_t get_lg_k() const;
173
-
174
- /**
175
- * @return configured resize factor of the sketch
176
- */
177
- resize_factor get_rf() const;
178
-
179
- /**
180
- * Update this sketch with a given string.
181
- * @param value string to update the sketch with
182
- */
183
- void update(const std::string& value);
184
-
185
- /**
186
- * Update this sketch with a given unsigned 64-bit integer.
187
- * @param value uint64_t to update the sketch with
188
- */
189
- void update(uint64_t value);
190
-
191
- /**
192
- * Update this sketch with a given signed 64-bit integer.
193
- * @param value int64_t to update the sketch with
194
- */
195
- void update(int64_t value);
196
-
197
- /**
198
- * Update this sketch with a given unsigned 32-bit integer.
199
- * For compatibility with Java implementation.
200
- * @param value uint32_t to update the sketch with
201
- */
202
- void update(uint32_t value);
203
-
204
- /**
205
- * Update this sketch with a given signed 32-bit integer.
206
- * For compatibility with Java implementation.
207
- * @param value int32_t to update the sketch with
208
- */
209
- void update(int32_t value);
210
-
211
- /**
212
- * Update this sketch with a given unsigned 16-bit integer.
213
- * For compatibility with Java implementation.
214
- * @param value uint16_t to update the sketch with
215
- */
216
- void update(uint16_t value);
217
-
218
- /**
219
- * Update this sketch with a given signed 16-bit integer.
220
- * For compatibility with Java implementation.
221
- * @param value int16_t to update the sketch with
222
- */
223
- void update(int16_t value);
224
-
225
- /**
226
- * Update this sketch with a given unsigned 8-bit integer.
227
- * For compatibility with Java implementation.
228
- * @param value uint8_t to update the sketch with
229
- */
230
- void update(uint8_t value);
231
-
232
- /**
233
- * Update this sketch with a given signed 8-bit integer.
234
- * For compatibility with Java implementation.
235
- * @param value int8_t to update the sketch with
236
- */
237
- void update(int8_t value);
238
-
239
- /**
240
- * Update this sketch with a given double-precision floating point value.
241
- * For compatibility with Java implementation.
242
- * @param value double to update the sketch with
243
- */
244
- void update(double value);
245
-
246
- /**
247
- * Update this sketch with a given floating point value.
248
- * For compatibility with Java implementation.
249
- * @param value float to update the sketch with
250
- */
251
- void update(float value);
252
-
253
- /**
254
- * Update this sketch with given data of any type.
255
- * This is a "universal" update that covers all cases above,
256
- * but may produce different hashes.
257
- * Be very careful to hash input values consistently using the same approach
258
- * both over time and on different platforms
259
- * and while passing sketches between C++ environment and Java environment.
260
- * Otherwise two sketches that should represent overlapping sets will be disjoint
261
- * For instance, for signed 32-bit values call update(int32_t) method above,
262
- * which does widening conversion to int64_t, if compatibility with Java is expected
263
- * @param data pointer to the data
264
- * @param length of the data in bytes
265
- */
266
- void update(const void* data, size_t length);
267
-
268
- /**
269
- * Remove retained entries in excess of the nominal size k (if any)
270
- */
271
- void trim();
272
-
273
- /**
274
- * Converts this sketch to a compact sketch (ordered or unordered).
275
- * @param ordered optional flag to specify if ordered sketch should be produced
276
- * @return compact sketch
277
- */
278
- compact_theta_sketch_experimental<Allocator> compact(bool ordered = true) const;
279
-
280
- virtual iterator begin();
281
- virtual iterator end();
282
- virtual const_iterator begin() const;
283
- virtual const_iterator end() const;
284
-
285
- private:
286
- theta_table table_;
287
-
288
- // for builder
289
- update_theta_sketch_experimental(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta,
290
- uint64_t seed, const Allocator& allocator);
291
-
292
- virtual void print_specifics(std::ostringstream& os) const;
293
- };
294
-
295
- // compact sketch
296
-
297
- template<typename Allocator = std::allocator<uint64_t>>
298
- class compact_theta_sketch_experimental: public theta_sketch_experimental<Allocator> {
299
- public:
300
- using Base = theta_sketch_experimental<Allocator>;
301
- using iterator = typename Base::iterator;
302
- using const_iterator = typename Base::const_iterator;
303
- using AllocBytes = typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>;
304
- using vector_bytes = std::vector<uint8_t, AllocBytes>;
305
-
306
- static const uint8_t SERIAL_VERSION = 3;
307
- static const uint8_t SKETCH_TYPE = 3;
308
-
309
- // Instances of this type can be obtained:
310
- // - by compacting an update_theta_sketch
311
- // - as a result of a set operation
312
- // - by deserializing a previously serialized compact sketch
313
-
314
- compact_theta_sketch_experimental(const Base& other, bool ordered);
315
- compact_theta_sketch_experimental(const compact_theta_sketch_experimental&) = default;
316
- compact_theta_sketch_experimental(compact_theta_sketch_experimental&&) noexcept = default;
317
- virtual ~compact_theta_sketch_experimental() = default;
318
- compact_theta_sketch_experimental& operator=(const compact_theta_sketch_experimental&) = default;
319
- compact_theta_sketch_experimental& operator=(compact_theta_sketch_experimental&&) = default;
320
-
321
- virtual Allocator get_allocator() const;
322
- virtual bool is_empty() const;
323
- virtual bool is_ordered() const;
324
- virtual uint64_t get_theta64() const;
325
- virtual uint32_t get_num_retained() const;
326
- virtual uint16_t get_seed_hash() const;
327
-
328
- /**
329
- * This method serializes the sketch into a given stream in a binary form
330
- * @param os output stream
331
- */
332
- void serialize(std::ostream& os) const;
333
-
334
- /**
335
- * This method serializes the sketch as a vector of bytes.
336
- * An optional header can be reserved in front of the sketch.
337
- * It is an uninitialized space of a given size.
338
- * This header is used in Datasketches PostgreSQL extension.
339
- * @param header_size_bytes space to reserve in front of the sketch
340
- */
341
- vector_bytes serialize(unsigned header_size_bytes = 0) const;
342
-
343
- virtual iterator begin();
344
- virtual iterator end();
345
- virtual const_iterator begin() const;
346
- virtual const_iterator end() const;
347
-
348
- /**
349
- * This method deserializes a sketch from a given stream.
350
- * @param is input stream
351
- * @param seed the seed for the hash function that was used to create the sketch
352
- * @return an instance of the sketch
353
- */
354
- static compact_theta_sketch_experimental deserialize(std::istream& is,
355
- uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
356
-
357
- /**
358
- * This method deserializes a sketch from a given array of bytes.
359
- * @param bytes pointer to the array of bytes
360
- * @param size the size of the array
361
- * @param seed the seed for the hash function that was used to create the sketch
362
- * @return an instance of the sketch
363
- */
364
- static compact_theta_sketch_experimental deserialize(const void* bytes, size_t size,
365
- uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
366
-
367
- // for internal use
368
- compact_theta_sketch_experimental(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<uint64_t, Allocator>&& entries);
369
-
370
- private:
371
- enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
372
-
373
- bool is_empty_;
374
- bool is_ordered_;
375
- uint16_t seed_hash_;
376
- uint64_t theta_;
377
- std::vector<uint64_t, Allocator> entries_;
378
-
379
- virtual void print_specifics(std::ostringstream& os) const;
380
- };
381
-
382
- template<typename Allocator>
383
- class update_theta_sketch_experimental<Allocator>::builder: public theta_base_builder<builder, Allocator> {
384
- public:
385
- builder(const Allocator& allocator = Allocator());
386
- update_theta_sketch_experimental build() const;
387
- };
388
-
389
- } /* namespace datasketches */
390
-
391
- #include "theta_sketch_experimental_impl.hpp"
392
-
393
- #endif