datasketches 0.1.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/LICENSE +40 -3
- data/NOTICE +1 -1
- data/ext/datasketches/cpc_wrapper.cpp +12 -13
- data/ext/datasketches/ext.cpp +1 -1
- data/ext/datasketches/ext.h +4 -0
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/fi_wrapper.cpp +6 -8
- data/ext/datasketches/hll_wrapper.cpp +13 -14
- data/ext/datasketches/kll_wrapper.cpp +28 -76
- data/ext/datasketches/theta_wrapper.cpp +27 -41
- data/ext/datasketches/vo_wrapper.cpp +4 -6
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +10 -0
- data/vendor/datasketches-cpp/LICENSE +40 -3
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +4 -4
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +18 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +13 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +20 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +116 -105
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +22 -6
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +140 -101
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +20 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -16
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +6 -6
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +21 -21
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +102 -105
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +141 -125
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +5 -5
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +81 -109
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +25 -24
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +89 -105
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +130 -165
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +21 -22
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +88 -83
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +34 -45
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +7 -8
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +41 -52
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +7 -8
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +220 -251
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +42 -42
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +36 -38
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +15 -14
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +47 -44
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +62 -87
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +121 -128
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +25 -53
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +8 -8
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +36 -36
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +28 -28
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +37 -37
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +57 -61
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +40 -25
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +50 -6
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +164 -136
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +178 -88
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/pyproject.toml +4 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +12 -6
- data/vendor/datasketches-cpp/python/README.md +52 -49
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -6
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +4 -2
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +38 -28
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -2
- data/vendor/datasketches-cpp/python/tests/kll_test.py +5 -5
- data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +18 -8
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +488 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +19 -13
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +130 -127
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +41 -49
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -44
- data/vendor/datasketches-cpp/setup.py +11 -6
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +3 -2
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +11 -4
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +26 -28
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
- data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
- data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +24 -36
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +163 -256
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +250 -651
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +6 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +10 -21
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +44 -30
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +60 -5
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +74 -235
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +57 -70
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +18 -21
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +13 -16
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +7 -6
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +3 -3
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +13 -16
- metadata +51 -36
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -20,45 +20,29 @@
|
|
20
20
|
#ifndef THETA_SKETCH_HPP_
|
21
21
|
#define THETA_SKETCH_HPP_
|
22
22
|
|
23
|
-
#include
|
24
|
-
#include <functional>
|
25
|
-
#include <climits>
|
26
|
-
#include <vector>
|
27
|
-
|
28
|
-
#include "common_defs.hpp"
|
23
|
+
#include "theta_update_sketch_base.hpp"
|
29
24
|
|
30
25
|
namespace datasketches {
|
31
26
|
|
32
|
-
|
33
|
-
* author Alexander Saydakov
|
34
|
-
* author Lee Rhodes
|
35
|
-
* author Kevin Lang
|
36
|
-
*/
|
37
|
-
|
38
|
-
// forward-declarations
|
39
|
-
template<typename A> class theta_sketch_alloc;
|
40
|
-
template<typename A> class update_theta_sketch_alloc;
|
41
|
-
template<typename A> class compact_theta_sketch_alloc;
|
42
|
-
template<typename A> class theta_union_alloc;
|
43
|
-
template<typename A> class theta_intersection_alloc;
|
44
|
-
template<typename A> class theta_a_not_b_alloc;
|
45
|
-
|
46
|
-
// for serialization as raw bytes
|
47
|
-
template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
|
48
|
-
template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
|
49
|
-
|
50
|
-
template<typename A>
|
27
|
+
template<typename Allocator = std::allocator<uint64_t>>
|
51
28
|
class theta_sketch_alloc {
|
52
29
|
public:
|
53
|
-
|
54
|
-
|
30
|
+
using Entry = uint64_t;
|
31
|
+
using ExtractKey = trivial_extract_key;
|
32
|
+
using iterator = theta_iterator<Entry, ExtractKey>;
|
33
|
+
using const_iterator = theta_const_iterator<Entry, ExtractKey>;
|
55
34
|
|
56
35
|
virtual ~theta_sketch_alloc() = default;
|
57
36
|
|
37
|
+
/**
|
38
|
+
* @return allocator
|
39
|
+
*/
|
40
|
+
virtual Allocator get_allocator() const = 0;
|
41
|
+
|
58
42
|
/**
|
59
43
|
* @return true if this sketch represents an empty set (not the same as no retained entries!)
|
60
44
|
*/
|
61
|
-
bool is_empty() const;
|
45
|
+
virtual bool is_empty() const = 0;
|
62
46
|
|
63
47
|
/**
|
64
48
|
* @return estimate of the distinct count of the input stream
|
@@ -96,13 +80,16 @@ public:
|
|
96
80
|
/**
|
97
81
|
* @return theta as a positive integer between 0 and LLONG_MAX
|
98
82
|
*/
|
99
|
-
uint64_t get_theta64() const;
|
83
|
+
virtual uint64_t get_theta64() const = 0;
|
100
84
|
|
101
85
|
/**
|
102
86
|
* @return the number of retained entries in the sketch
|
103
87
|
*/
|
104
88
|
virtual uint32_t get_num_retained() const = 0;
|
105
89
|
|
90
|
+
/**
|
91
|
+
* @return hash of the seed that was used to hash the input
|
92
|
+
*/
|
106
93
|
virtual uint16_t get_seed_hash() const = 0;
|
107
94
|
|
108
95
|
/**
|
@@ -111,109 +98,82 @@ public:
|
|
111
98
|
virtual bool is_ordered() const = 0;
|
112
99
|
|
113
100
|
/**
|
114
|
-
*
|
101
|
+
* Provides a human-readable summary of this sketch as a string
|
115
102
|
* @param print_items if true include the list of items retained by the sketch
|
103
|
+
* @return sketch summary as a string
|
116
104
|
*/
|
117
|
-
virtual string<
|
105
|
+
virtual string<Allocator> to_string(bool print_items = false) const;
|
118
106
|
|
119
107
|
/**
|
120
|
-
*
|
121
|
-
* @
|
122
|
-
*/
|
123
|
-
virtual void serialize(std::ostream& os) const = 0;
|
124
|
-
|
125
|
-
// This is a convenience alias for users
|
126
|
-
// The type returned by the following serialize method
|
127
|
-
typedef vector_u8<A> vector_bytes;
|
128
|
-
|
129
|
-
/**
|
130
|
-
* This method serializes the sketch as a vector of bytes.
|
131
|
-
* An optional header can be reserved in front of the sketch.
|
132
|
-
* It is an uninitialized space of a given size.
|
133
|
-
* This header is used in Datasketches PostgreSQL extension.
|
134
|
-
* @param header_size_bytes space to reserve in front of the sketch
|
135
|
-
*/
|
136
|
-
virtual vector_bytes serialize(unsigned header_size_bytes = 0) const = 0;
|
137
|
-
|
138
|
-
// This is a convenience alias for users
|
139
|
-
// The type returned by the following deserialize methods
|
140
|
-
// It is not possible to return instances of an abstract type, so this has to be a pointer
|
141
|
-
typedef std::unique_ptr<theta_sketch_alloc<A>, std::function<void(theta_sketch_alloc<A>*)>> unique_ptr;
|
142
|
-
|
143
|
-
/**
|
144
|
-
* This method deserializes a sketch from a given stream.
|
145
|
-
* @param is input stream
|
146
|
-
* @param seed the seed for the hash function that was used to create the sketch
|
147
|
-
* @return an instance of a sketch as a unique_ptr
|
108
|
+
* Iterator over hash values in this sketch.
|
109
|
+
* @return begin iterator
|
148
110
|
*/
|
149
|
-
|
111
|
+
virtual iterator begin() = 0;
|
150
112
|
|
151
113
|
/**
|
152
|
-
*
|
153
|
-
*
|
154
|
-
* @
|
155
|
-
* @param seed the seed for the hash function that was used to create the sketch
|
156
|
-
* @return an instance of the sketch
|
114
|
+
* Iterator pointing past the valid range.
|
115
|
+
* Not to be incremented or dereferenced.
|
116
|
+
* @return end iterator
|
157
117
|
*/
|
158
|
-
|
159
|
-
|
160
|
-
class const_iterator;
|
118
|
+
virtual iterator end() = 0;
|
161
119
|
|
162
120
|
/**
|
163
|
-
*
|
121
|
+
* Const iterator over hash values in this sketch.
|
164
122
|
* @return begin iterator
|
165
123
|
*/
|
166
124
|
virtual const_iterator begin() const = 0;
|
167
125
|
|
168
126
|
/**
|
169
|
-
*
|
127
|
+
* Const iterator pointing past the valid range.
|
170
128
|
* Not to be incremented or dereferenced.
|
171
129
|
* @return end iterator
|
172
130
|
*/
|
173
131
|
virtual const_iterator end() const = 0;
|
174
132
|
|
175
133
|
protected:
|
176
|
-
|
177
|
-
|
178
|
-
bool is_empty_;
|
179
|
-
uint64_t theta_;
|
180
|
-
|
181
|
-
theta_sketch_alloc(bool is_empty, uint64_t theta);
|
182
|
-
|
183
|
-
static uint16_t get_seed_hash(uint64_t seed);
|
184
|
-
|
185
|
-
static void check_sketch_type(uint8_t actual, uint8_t expected);
|
186
|
-
static void check_serial_version(uint8_t actual, uint8_t expected);
|
187
|
-
static void check_seed_hash(uint16_t actual, uint16_t expected);
|
188
|
-
|
189
|
-
friend theta_intersection_alloc<A>;
|
190
|
-
friend theta_a_not_b_alloc<A>;
|
134
|
+
using ostrstream = std::basic_ostringstream<char, std::char_traits<char>, AllocChar<Allocator>>;
|
135
|
+
virtual void print_specifics(ostrstream& os) const = 0;
|
191
136
|
};
|
192
137
|
|
193
|
-
//
|
194
|
-
|
195
|
-
template<typename A> using AllocU64 = typename std::allocator_traits<A>::template rebind_alloc<uint64_t>;
|
196
|
-
template<typename A> using vector_u64 = std::vector<uint64_t, AllocU64<A>>;
|
138
|
+
// forward declaration
|
139
|
+
template<typename A> class compact_theta_sketch_alloc;
|
197
140
|
|
198
|
-
template<typename
|
199
|
-
class update_theta_sketch_alloc: public theta_sketch_alloc<
|
141
|
+
template<typename Allocator = std::allocator<uint64_t>>
|
142
|
+
class update_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
|
200
143
|
public:
|
201
|
-
|
202
|
-
|
203
|
-
|
144
|
+
using Base = theta_sketch_alloc<Allocator>;
|
145
|
+
using Entry = typename Base::Entry;
|
146
|
+
using ExtractKey = typename Base::ExtractKey;
|
147
|
+
using iterator = typename Base::iterator;
|
148
|
+
using const_iterator = typename Base::const_iterator;
|
149
|
+
using theta_table = theta_update_sketch_base<Entry, ExtractKey, Allocator>;
|
150
|
+
using resize_factor = typename theta_table::resize_factor;
|
204
151
|
|
205
152
|
// No constructor here. Use builder instead.
|
153
|
+
class builder;
|
206
154
|
|
155
|
+
update_theta_sketch_alloc(const update_theta_sketch_alloc&) = default;
|
156
|
+
update_theta_sketch_alloc(update_theta_sketch_alloc&&) noexcept = default;
|
207
157
|
virtual ~update_theta_sketch_alloc() = default;
|
158
|
+
update_theta_sketch_alloc& operator=(const update_theta_sketch_alloc&) = default;
|
159
|
+
update_theta_sketch_alloc& operator=(update_theta_sketch_alloc&&) = default;
|
208
160
|
|
209
|
-
virtual
|
210
|
-
virtual
|
161
|
+
virtual Allocator get_allocator() const;
|
162
|
+
virtual bool is_empty() const;
|
211
163
|
virtual bool is_ordered() const;
|
212
|
-
virtual
|
213
|
-
virtual
|
214
|
-
|
215
|
-
|
216
|
-
|
164
|
+
virtual uint16_t get_seed_hash() const;
|
165
|
+
virtual uint64_t get_theta64() const;
|
166
|
+
virtual uint32_t get_num_retained() const;
|
167
|
+
|
168
|
+
/**
|
169
|
+
* @return configured nominal number of entries in the sketch
|
170
|
+
*/
|
171
|
+
uint8_t get_lg_k() const;
|
172
|
+
|
173
|
+
/**
|
174
|
+
* @return configured resize factor of the sketch
|
175
|
+
*/
|
176
|
+
resize_factor get_rf() const;
|
217
177
|
|
218
178
|
/**
|
219
179
|
* Update this sketch with a given string.
|
@@ -302,7 +262,7 @@ public:
|
|
302
262
|
* @param data pointer to the data
|
303
263
|
* @param length of the data in bytes
|
304
264
|
*/
|
305
|
-
void update(const void* data,
|
265
|
+
void update(const void* data, size_t length);
|
306
266
|
|
307
267
|
/**
|
308
268
|
* Remove retained entries in excess of the nominal size k (if any)
|
@@ -314,105 +274,86 @@ public:
|
|
314
274
|
* @param ordered optional flag to specify if ordered sketch should be produced
|
315
275
|
* @return compact sketch
|
316
276
|
*/
|
317
|
-
compact_theta_sketch_alloc<
|
318
|
-
|
319
|
-
virtual typename theta_sketch_alloc<A>::const_iterator begin() const;
|
320
|
-
virtual typename theta_sketch_alloc<A>::const_iterator end() const;
|
321
|
-
|
322
|
-
/**
|
323
|
-
* This method deserializes a sketch from a given stream.
|
324
|
-
* @param is input stream
|
325
|
-
* @param seed the seed for the hash function that was used to create the sketch
|
326
|
-
* @return an instance of a sketch
|
327
|
-
*/
|
328
|
-
static update_theta_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
|
277
|
+
compact_theta_sketch_alloc<Allocator> compact(bool ordered = true) const;
|
329
278
|
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
* @param seed the seed for the hash function that was used to create the sketch
|
335
|
-
* @return an instance of the sketch
|
336
|
-
*/
|
337
|
-
static update_theta_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
|
279
|
+
virtual iterator begin();
|
280
|
+
virtual iterator end();
|
281
|
+
virtual const_iterator begin() const;
|
282
|
+
virtual const_iterator end() const;
|
338
283
|
|
339
284
|
private:
|
340
|
-
|
341
|
-
static constexpr double RESIZE_THRESHOLD = 0.5;
|
342
|
-
// hash table rebuild threshold = 15/16
|
343
|
-
static constexpr double REBUILD_THRESHOLD = 15.0 / 16.0;
|
344
|
-
|
345
|
-
static constexpr uint8_t STRIDE_HASH_BITS = 7;
|
346
|
-
static constexpr uint32_t STRIDE_MASK = (1 << STRIDE_HASH_BITS) - 1;
|
347
|
-
|
348
|
-
uint8_t lg_cur_size_;
|
349
|
-
uint8_t lg_nom_size_;
|
350
|
-
vector_u64<A> keys_;
|
351
|
-
uint32_t num_keys_;
|
352
|
-
resize_factor rf_;
|
353
|
-
float p_;
|
354
|
-
uint64_t seed_;
|
355
|
-
uint32_t capacity_;
|
285
|
+
theta_table table_;
|
356
286
|
|
357
287
|
// for builder
|
358
|
-
update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
|
359
|
-
|
360
|
-
// for deserialize
|
361
|
-
update_theta_sketch_alloc(bool is_empty, uint64_t theta, uint8_t lg_cur_size, uint8_t lg_nom_size, vector_u64<A>&& keys, uint32_t num_keys, resize_factor rf, float p, uint64_t seed);
|
362
|
-
|
363
|
-
void resize();
|
364
|
-
void rebuild();
|
288
|
+
update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta,
|
289
|
+
uint64_t seed, const Allocator& allocator);
|
365
290
|
|
366
|
-
|
367
|
-
void
|
368
|
-
|
369
|
-
friend theta_intersection_alloc<A>;
|
370
|
-
friend theta_a_not_b_alloc<A>;
|
371
|
-
static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
|
372
|
-
static inline uint32_t get_stride(uint64_t hash, uint8_t lg_size);
|
373
|
-
static bool hash_search_or_insert(uint64_t hash, uint64_t* table, uint8_t lg_size);
|
374
|
-
static bool hash_search(uint64_t hash, const uint64_t* table, uint8_t lg_size);
|
375
|
-
|
376
|
-
friend theta_sketch_alloc<A>;
|
377
|
-
static update_theta_sketch_alloc<A> internal_deserialize(std::istream& is, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed);
|
378
|
-
static update_theta_sketch_alloc<A> internal_deserialize(const void* bytes, size_t size, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed);
|
291
|
+
using ostrstream = typename Base::ostrstream;
|
292
|
+
virtual void print_specifics(ostrstream& os) const;
|
379
293
|
};
|
380
294
|
|
381
295
|
// compact sketch
|
382
296
|
|
383
|
-
template<typename
|
384
|
-
class compact_theta_sketch_alloc: public theta_sketch_alloc<
|
297
|
+
template<typename Allocator = std::allocator<uint64_t>>
|
298
|
+
class compact_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
|
385
299
|
public:
|
300
|
+
using Base = theta_sketch_alloc<Allocator>;
|
301
|
+
using iterator = typename Base::iterator;
|
302
|
+
using const_iterator = typename Base::const_iterator;
|
303
|
+
using AllocBytes = typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>;
|
304
|
+
using vector_bytes = std::vector<uint8_t, AllocBytes>;
|
305
|
+
|
306
|
+
static const uint8_t SERIAL_VERSION = 3;
|
386
307
|
static const uint8_t SKETCH_TYPE = 3;
|
387
308
|
|
388
|
-
// No constructor here.
|
389
309
|
// Instances of this type can be obtained:
|
390
|
-
// - by compacting an
|
310
|
+
// - by compacting an update_theta_sketch_alloc
|
391
311
|
// - as a result of a set operation
|
392
312
|
// - by deserializing a previously serialized compact sketch
|
393
313
|
|
394
|
-
|
314
|
+
template<typename Other>
|
315
|
+
compact_theta_sketch_alloc(const Other& other, bool ordered);
|
316
|
+
compact_theta_sketch_alloc(const compact_theta_sketch_alloc&) = default;
|
317
|
+
compact_theta_sketch_alloc(compact_theta_sketch_alloc&&) noexcept = default;
|
395
318
|
virtual ~compact_theta_sketch_alloc() = default;
|
319
|
+
compact_theta_sketch_alloc& operator=(const compact_theta_sketch_alloc&) = default;
|
320
|
+
compact_theta_sketch_alloc& operator=(compact_theta_sketch_alloc&&) = default;
|
396
321
|
|
322
|
+
virtual Allocator get_allocator() const;
|
323
|
+
virtual bool is_empty() const;
|
324
|
+
virtual bool is_ordered() const;
|
325
|
+
virtual uint64_t get_theta64() const;
|
397
326
|
virtual uint32_t get_num_retained() const;
|
398
327
|
virtual uint16_t get_seed_hash() const;
|
399
|
-
virtual bool is_ordered() const;
|
400
|
-
virtual string<A> to_string(bool print_items = false) const;
|
401
|
-
virtual void serialize(std::ostream& os) const;
|
402
|
-
typedef vector_u8<A> vector_bytes; // alias for users
|
403
|
-
// header space is reserved, but not initialized
|
404
|
-
virtual vector_bytes serialize(unsigned header_size_bytes = 0) const;
|
405
328
|
|
406
|
-
|
407
|
-
|
329
|
+
/**
|
330
|
+
* This method serializes the sketch into a given stream in a binary form
|
331
|
+
* @param os output stream
|
332
|
+
*/
|
333
|
+
void serialize(std::ostream& os) const;
|
334
|
+
|
335
|
+
/**
|
336
|
+
* This method serializes the sketch as a vector of bytes.
|
337
|
+
* An optional header can be reserved in front of the sketch.
|
338
|
+
* It is an uninitialized space of a given size.
|
339
|
+
* This header is used in Datasketches PostgreSQL extension.
|
340
|
+
* @param header_size_bytes space to reserve in front of the sketch
|
341
|
+
*/
|
342
|
+
vector_bytes serialize(unsigned header_size_bytes = 0) const;
|
343
|
+
|
344
|
+
virtual iterator begin();
|
345
|
+
virtual iterator end();
|
346
|
+
virtual const_iterator begin() const;
|
347
|
+
virtual const_iterator end() const;
|
408
348
|
|
409
349
|
/**
|
410
350
|
* This method deserializes a sketch from a given stream.
|
411
351
|
* @param is input stream
|
412
352
|
* @param seed the seed for the hash function that was used to create the sketch
|
413
|
-
* @return an instance of
|
353
|
+
* @return an instance of the sketch
|
414
354
|
*/
|
415
|
-
static compact_theta_sketch_alloc
|
355
|
+
static compact_theta_sketch_alloc deserialize(std::istream& is,
|
356
|
+
uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
|
416
357
|
|
417
358
|
/**
|
418
359
|
* This method deserializes a sketch from a given array of bytes.
|
@@ -421,110 +362,76 @@ public:
|
|
421
362
|
* @param seed the seed for the hash function that was used to create the sketch
|
422
363
|
* @return an instance of the sketch
|
423
364
|
*/
|
424
|
-
static compact_theta_sketch_alloc
|
365
|
+
static compact_theta_sketch_alloc deserialize(const void* bytes, size_t size,
|
366
|
+
uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
|
367
|
+
|
368
|
+
// for internal use
|
369
|
+
compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<uint64_t, Allocator>&& entries);
|
425
370
|
|
426
371
|
private:
|
427
|
-
|
372
|
+
enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
|
428
373
|
|
429
|
-
|
430
|
-
uint16_t seed_hash_;
|
374
|
+
bool is_empty_;
|
431
375
|
bool is_ordered_;
|
376
|
+
uint16_t seed_hash_;
|
377
|
+
uint64_t theta_;
|
378
|
+
std::vector<uint64_t, Allocator> entries_;
|
432
379
|
|
433
|
-
|
434
|
-
|
435
|
-
friend theta_union_alloc<A>;
|
436
|
-
friend theta_intersection_alloc<A>;
|
437
|
-
friend theta_a_not_b_alloc<A>;
|
438
|
-
compact_theta_sketch_alloc(bool is_empty, uint64_t theta, vector_u64<A>&& keys, uint16_t seed_hash, bool is_ordered);
|
439
|
-
static compact_theta_sketch_alloc<A> internal_deserialize(std::istream& is, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash);
|
440
|
-
static compact_theta_sketch_alloc<A> internal_deserialize(const void* bytes, size_t size, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash);
|
380
|
+
using ostrstream = typename Base::ostrstream;
|
381
|
+
virtual void print_specifics(ostrstream& os) const;
|
441
382
|
};
|
442
383
|
|
443
|
-
|
444
|
-
|
445
|
-
template<typename A>
|
446
|
-
class update_theta_sketch_alloc<A>::builder {
|
384
|
+
template<typename Allocator>
|
385
|
+
class update_theta_sketch_alloc<Allocator>::builder: public theta_base_builder<builder, Allocator> {
|
447
386
|
public:
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
/**
|
453
|
-
* Creates an instance of the builder with default parameters.
|
454
|
-
*/
|
455
|
-
builder();
|
387
|
+
builder(const Allocator& allocator = Allocator());
|
388
|
+
update_theta_sketch_alloc build() const;
|
389
|
+
};
|
456
390
|
|
457
|
-
|
458
|
-
|
459
|
-
* @param lg_k base 2 logarithm of nominal number of entries
|
460
|
-
* @return this builder
|
461
|
-
*/
|
462
|
-
builder& set_lg_k(uint8_t lg_k);
|
391
|
+
// This is to wrap a buffer containing a serialized compact sketch and use it in a set operation avoiding some cost of deserialization.
|
392
|
+
// It does not take ownership of the buffer.
|
463
393
|
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
*/
|
469
|
-
builder& set_resize_factor(resize_factor rf);
|
394
|
+
template<typename Allocator = std::allocator<uint64_t>>
|
395
|
+
class wrapped_compact_theta_sketch_alloc {
|
396
|
+
public:
|
397
|
+
using const_iterator = const uint64_t*;
|
470
398
|
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
*/
|
478
|
-
builder& set_p(float p);
|
399
|
+
Allocator get_allocator() const;
|
400
|
+
bool is_empty() const;
|
401
|
+
bool is_ordered() const;
|
402
|
+
uint64_t get_theta64() const;
|
403
|
+
uint32_t get_num_retained() const;
|
404
|
+
uint16_t get_seed_hash() const;
|
479
405
|
|
480
|
-
|
481
|
-
|
482
|
-
* Sketches produced with different seed are not compatible
|
483
|
-
* and cannot be mixed in set operations.
|
484
|
-
* @param seed hash seed
|
485
|
-
* @return this builder
|
486
|
-
*/
|
487
|
-
builder& set_seed(uint64_t seed);
|
406
|
+
const_iterator begin() const;
|
407
|
+
const_iterator end() const;
|
488
408
|
|
489
409
|
/**
|
490
|
-
* This
|
491
|
-
* @
|
410
|
+
* This method wraps a serialized compact sketch as an array of bytes.
|
411
|
+
* @param bytes pointer to the array of bytes
|
412
|
+
* @param size the size of the array
|
413
|
+
* @param seed the seed for the hash function that was used to create the sketch
|
414
|
+
* @return an instance of the sketch
|
492
415
|
*/
|
493
|
-
|
416
|
+
static const wrapped_compact_theta_sketch_alloc wrap(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, bool dump_on_error = false);
|
494
417
|
|
495
418
|
private:
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
};
|
503
|
-
|
504
|
-
// iterator
|
505
|
-
template<typename A>
|
506
|
-
class theta_sketch_alloc<A>::const_iterator: public std::iterator<std::input_iterator_tag, uint64_t> {
|
507
|
-
public:
|
508
|
-
const_iterator& operator++();
|
509
|
-
const_iterator operator++(int);
|
510
|
-
bool operator==(const const_iterator& other) const;
|
511
|
-
bool operator!=(const const_iterator& other) const;
|
512
|
-
uint64_t operator*() const;
|
419
|
+
bool is_empty_;
|
420
|
+
bool is_ordered_;
|
421
|
+
uint16_t seed_hash_;
|
422
|
+
uint32_t num_entries_;
|
423
|
+
uint64_t theta_;
|
424
|
+
const uint64_t* entries_;
|
513
425
|
|
514
|
-
|
515
|
-
|
516
|
-
uint32_t size_;
|
517
|
-
uint32_t index_;
|
518
|
-
const_iterator(const uint64_t* keys, uint32_t size, uint32_t index);
|
519
|
-
friend class update_theta_sketch_alloc<A>;
|
520
|
-
friend class compact_theta_sketch_alloc<A>;
|
426
|
+
wrapped_compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint32_t num_entries,
|
427
|
+
uint64_t theta, const uint64_t* entries);
|
521
428
|
};
|
522
429
|
|
523
|
-
|
524
430
|
// aliases with default allocator for convenience
|
525
|
-
|
526
|
-
|
527
|
-
|
431
|
+
using theta_sketch = theta_sketch_alloc<std::allocator<uint64_t>>;
|
432
|
+
using update_theta_sketch = update_theta_sketch_alloc<std::allocator<uint64_t>>;
|
433
|
+
using compact_theta_sketch = compact_theta_sketch_alloc<std::allocator<uint64_t>>;
|
434
|
+
using wrapped_compact_theta_sketch = wrapped_compact_theta_sketch_alloc<std::allocator<uint64_t>>;
|
528
435
|
|
529
436
|
} /* namespace datasketches */
|
530
437
|
|