datasketches 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/datasketches/cpc_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
- data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
- data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
- data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
- data/vendor/datasketches-cpp/python/README.md +5 -5
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
- data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
- data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
- data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
- data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
- data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
- data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +31 -3
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef COUNT_MIN_HPP_
|
|
21
|
+
#define COUNT_MIN_HPP_
|
|
22
|
+
|
|
23
|
+
#include <iterator>
|
|
24
|
+
#include "common_defs.hpp"
|
|
25
|
+
|
|
26
|
+
namespace datasketches {
|
|
27
|
+
|
|
28
|
+
/*
|
|
29
|
+
* C++ implementation of the CountMin sketch data structure of Cormode and Muthukrishnan.
|
|
30
|
+
* [1] - http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf
|
|
31
|
+
* The template type W is the type of the vector that contains the weights of the objects inserted into the sketch,
|
|
32
|
+
* not the type of the input items themselves.
|
|
33
|
+
* @author Charlie Dickens
|
|
34
|
+
*/
|
|
35
|
+
|
|
36
|
+
template <typename W,
|
|
37
|
+
typename Allocator = std::allocator<W>>
|
|
38
|
+
class count_min_sketch{
|
|
39
|
+
static_assert(std::is_arithmetic<W>::value, "Arithmetic type expected");
|
|
40
|
+
public:
|
|
41
|
+
using allocator_type = Allocator;
|
|
42
|
+
|
|
43
|
+
/**
|
|
44
|
+
* Creates an instance of the sketch given the parameters `num_hashes`, `num_buckets` and the hash seed `seed`.
|
|
45
|
+
* @param num_hashes : number of hash functions in the sketch. Equivalently the number of rows in the array
|
|
46
|
+
* @param num_buckets : number of buckets that hash functions map into. Equivalently the number of columns in the array
|
|
47
|
+
* @param seed for hash function
|
|
48
|
+
*
|
|
49
|
+
* The items inserted into the sketch can be arbitrary type, so long as they are hashable via murmurhash.
|
|
50
|
+
* Convenience update and estimate overloads are provided only for uint64_t, int64_t and std::string types.
|
|
51
|
+
*/
|
|
52
|
+
count_min_sketch(uint8_t num_hashes, uint32_t num_buckets, uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator()) ;
|
|
53
|
+
|
|
54
|
+
/**
|
|
55
|
+
* @return configured _num_hashes of this sketch
|
|
56
|
+
*/
|
|
57
|
+
uint8_t get_num_hashes() const;
|
|
58
|
+
|
|
59
|
+
/**
|
|
60
|
+
* @return configured _num_buckets of this sketch
|
|
61
|
+
*/
|
|
62
|
+
uint32_t get_num_buckets() const;
|
|
63
|
+
|
|
64
|
+
/**
|
|
65
|
+
* @return configured seed of this sketch
|
|
66
|
+
*/
|
|
67
|
+
uint64_t get_seed() const;
|
|
68
|
+
|
|
69
|
+
/**
|
|
70
|
+
* @return epsilon : double
|
|
71
|
+
* The maximum permissible error for any frequency estimate query.
|
|
72
|
+
* epsilon = e / _num_buckets (the inverse of the sizing rule num_buckets = ceil(e / epsilon) from [1])
|
|
73
|
+
*/
|
|
74
|
+
double get_relative_error() const;
|
|
75
|
+
|
|
76
|
+
/**
|
|
77
|
+
* @return _total_weight : typename W
|
|
78
|
+
* The total weight currently inserted into the sketch.
|
|
79
|
+
*/
|
|
80
|
+
W get_total_weight() const;
|
|
81
|
+
|
|
82
|
+
/*
|
|
83
|
+
* @param relative_error : double -- the desired accuracy within which estimates should lie.
|
|
84
|
+
* For example, when relative_error = 0.05, then the returned frequency estimates satisfy the
|
|
85
|
+
* `relative_error` guarantee that never overestimates the weights but may underestimate the weights
|
|
86
|
+
* by 5% of the total weight in the sketch.
|
|
87
|
+
* @return number_of_buckets : the number of hash buckets at every level of the
|
|
88
|
+
* sketch required in order to obtain the specified relative error.
|
|
89
|
+
* [1] - Section 3 ``Data Structure'', page 6.
|
|
90
|
+
*/
|
|
91
|
+
static uint32_t suggest_num_buckets(double relative_error) ;
|
|
92
|
+
|
|
93
|
+
/*
|
|
94
|
+
* @param confidence : double -- the desired confidence with which estimates should be correct.
|
|
95
|
+
* For example, with 95% confidence, frequency estimates satisfy the `relative_error` guarantee.
|
|
96
|
+
* @return number_of_hashes : the number of hash functions that are required in
|
|
97
|
+
* order to achieve the specified confidence of the sketch.
|
|
98
|
+
* confidence = 1 - delta, with delta denoting the sketch failure probability in the literature.
|
|
99
|
+
* [1] - Section 3 ``Data Structure'', page 6.
|
|
100
|
+
*/
|
|
101
|
+
static uint8_t suggest_num_hashes(double confidence) ;
|
|
102
|
+
|
|
103
|
+
/**
|
|
104
|
+
* Specific get_estimate function for uint64_t type
|
|
105
|
+
* see generic get_estimate function
|
|
106
|
+
* @param item : uint64_t type.
|
|
107
|
+
* @return an estimate of the item's frequency.
|
|
108
|
+
*/
|
|
109
|
+
W get_estimate(uint64_t item) const ;
|
|
110
|
+
|
|
111
|
+
/**
|
|
112
|
+
* Specific get_estimate function for int64_t type
|
|
113
|
+
* see generic get_estimate function
|
|
114
|
+
* @param item : int64_t type.
|
|
115
|
+
* @return an estimate of the item's frequency.
|
|
116
|
+
*/
|
|
117
|
+
W get_estimate(int64_t item) const ;
|
|
118
|
+
|
|
119
|
+
/**
|
|
120
|
+
* Specific get_estimate function for std::string type
|
|
121
|
+
* see generic get_estimate function
|
|
122
|
+
* @param item : std::string type
|
|
123
|
+
* @return an estimate of the item's frequency.
|
|
124
|
+
*/
|
|
125
|
+
W get_estimate(const std::string& item) const;
|
|
126
|
+
|
|
127
|
+
/**
|
|
128
|
+
* This is the generic estimate query function for any of the given datatypes.
|
|
129
|
+
* Query the sketch for the estimate of a given item.
|
|
130
|
+
* @param item : pointer to the data item to be queried from the sketch.
|
|
131
|
+
* @param size : size_t, size of the data item in bytes
|
|
132
|
+
* @return the estimated frequency of the item denoted f_est satisfying
|
|
133
|
+
* f_true - relative_error*_total_weight <= f_est <= f_true
|
|
134
|
+
*/
|
|
135
|
+
W get_estimate(const void* item, size_t size) const ;
|
|
136
|
+
|
|
137
|
+
/**
|
|
138
|
+
* Query the sketch for the upper bound of a given item.
|
|
139
|
+
* @param item : item to query (uint64_t, int64_t, std::string, or a pointer to raw bytes together with its size)
|
|
140
|
+
* @return the upper bound on the true frequency of the item
|
|
141
|
+
* f_true <= f_est + relative_error*_total_weight
|
|
142
|
+
*/
|
|
143
|
+
W get_upper_bound(const void* item, size_t size) const;
|
|
144
|
+
W get_upper_bound(int64_t) const ;
|
|
145
|
+
W get_upper_bound(uint64_t) const ;
|
|
146
|
+
W get_upper_bound(const std::string& item) const;
|
|
147
|
+
|
|
148
|
+
/**
|
|
149
|
+
* Query the sketch for the lower bound of a given item.
|
|
150
|
+
* @param item : item to query (uint64_t, int64_t, std::string, or a pointer to raw bytes together with its size)
|
|
151
|
+
* @return the lower bound for the query result, f_est, on the true frequency, f_true, of the item
|
|
152
|
+
* f_true - relative_error*_total_weight <= f_est
|
|
153
|
+
*/
|
|
154
|
+
W get_lower_bound(const void* item, size_t size) const ;
|
|
155
|
+
W get_lower_bound(int64_t) const ;
|
|
156
|
+
W get_lower_bound(uint64_t) const ;
|
|
157
|
+
W get_lower_bound(const std::string& item) const ;
|
|
158
|
+
|
|
159
|
+
/*
|
|
160
|
+
* Update this sketch with given data of any type.
|
|
161
|
+
* This is a "universal" update that covers all cases above,
|
|
162
|
+
* but may produce different hashes.
|
|
163
|
+
* @param item pointer to the data item to be inserted into the sketch.
|
|
164
|
+
* @param size of the data in bytes
|
|
165
|
+
* @param weight : arithmetic type, the weight with which the item is inserted (this function returns nothing)
|
|
166
|
+
*/
|
|
167
|
+
void update(const void* item, size_t size, W weight) ;
|
|
168
|
+
|
|
169
|
+
/**
|
|
170
|
+
* Update this sketch with a given uint64_t item.
|
|
171
|
+
* @param item : uint64_t to update the sketch with
|
|
172
|
+
* @param weight : arithmetic type
|
|
173
|
+
* void function which inserts an item of type uint64_t into the sketch
|
|
174
|
+
*/
|
|
175
|
+
void update(uint64_t item, W weight) ;
|
|
176
|
+
void update(uint64_t item) ;
|
|
177
|
+
void update(int64_t item, W weight) ;
|
|
178
|
+
void update(int64_t item) ;
|
|
179
|
+
|
|
180
|
+
/**
|
|
181
|
+
* Update this sketch with a given string.
|
|
182
|
+
* @param item : string to update the sketch with
|
|
183
|
+
* @param weight : arithmetic type
|
|
184
|
+
* void function which inserts an item of type std::string into the sketch
|
|
185
|
+
*/
|
|
186
|
+
void update(const std::string& item, W weight) ;
|
|
187
|
+
void update(const std::string& item) ;
|
|
188
|
+
|
|
189
|
+
/*
|
|
190
|
+
* merges a separate count_min_sketch into this count_min_sketch.
|
|
191
|
+
*/
|
|
192
|
+
void merge(const count_min_sketch &other_sketch) ;
|
|
193
|
+
|
|
194
|
+
/**
|
|
195
|
+
* Returns true if this sketch is empty.
|
|
196
|
+
* A Count Min Sketch is defined to be empty iff weight == 0
|
|
197
|
+
* This can only ever happen if all items inserted to the sketch have weights that cancel each other out.
|
|
198
|
+
* @return empty flag
|
|
199
|
+
*/
|
|
200
|
+
bool is_empty() const ;
|
|
201
|
+
|
|
202
|
+
/**
|
|
203
|
+
* @brief Returns a string describing the sketch
|
|
204
|
+
* @return A string with a human-readable description of the sketch
|
|
205
|
+
*/
|
|
206
|
+
string<Allocator> to_string() const;
|
|
207
|
+
|
|
208
|
+
// Iterators
|
|
209
|
+
using const_iterator = typename std::vector<W, Allocator>::const_iterator ;
|
|
210
|
+
const_iterator begin() const;
|
|
211
|
+
const_iterator end() const;
|
|
212
|
+
|
|
213
|
+
/**
|
|
214
|
+
* This method serializes the sketch into a given stream in a binary form
|
|
215
|
+
* @param os output stream
|
|
216
|
+
* The byte output has the following structure
|
|
217
|
+
* Byte 0:
|
|
218
|
+
* 1 - if and only if the sketch is empty
|
|
219
|
+
* 0 - otherwise
|
|
220
|
+
*
|
|
221
|
+
* Byte 1 (serial version), byte 2 (family id), byte 3 (flags):
|
|
222
|
+
* 00000001 - default for now.
|
|
223
|
+
*
|
|
224
|
+
* Bytes 4 - 7:
|
|
225
|
+
* uint8_t zero corresponding to ``empty''
|
|
226
|
+
*
|
|
227
|
+
* Byte 8:
|
|
228
|
+
* uint8_t for number of hash functions
|
|
229
|
+
*
|
|
230
|
+
* Bytes 9, 13
|
|
231
|
+
* 4 bytes : uint32 for number of buckets.
|
|
232
|
+
*
|
|
233
|
+
* Bytes 14, 15:
|
|
234
|
+
* seed_hash
|
|
235
|
+
*
|
|
236
|
+
* Byte 16:
|
|
237
|
+
* uint8_t zero corresponding to ``empty''
|
|
238
|
+
*
|
|
239
|
+
* All remaining bytes from 17-24 follow the pattern of
|
|
240
|
+
* Bytes 17-24:
|
|
241
|
+
* Sketch array entry
|
|
242
|
+
*
|
|
243
|
+
|
|
244
|
+
0 || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
|
|
245
|
+
||is_empty|ser__ver|familyId| flags |xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|
|
|
246
|
+
|
|
247
|
+
1 || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
|
|
248
|
+
||---------- _num_buckets -----------|num_hash|__seed__ __hash__|xxxxxxxx|
|
|
249
|
+
|
|
250
|
+
2 || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
|
|
251
|
+
||---------------------------- total weight ----------------------------|
|
|
252
|
+
|
|
253
|
+
3 || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
|
|
254
|
+
||---------------------------- sketch entries ---------------------------|
|
|
255
|
+
...
|
|
256
|
+
|
|
257
|
+
*
|
|
258
|
+
*
|
|
259
|
+
*/
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
/**
|
|
263
|
+
* Computes size needed to serialize the current state of the sketch.
|
|
264
|
+
* @return size in bytes needed to serialize this sketch
|
|
265
|
+
*/
|
|
266
|
+
size_t get_serialized_size_bytes() const;
|
|
267
|
+
|
|
268
|
+
/**
|
|
269
|
+
* This method serializes a binary image of the sketch to an output stream.
|
|
270
|
+
*/
|
|
271
|
+
void serialize(std::ostream& os) const;
|
|
272
|
+
|
|
273
|
+
// This is a convenience alias for users
|
|
274
|
+
// The type returned by the following serialize method
|
|
275
|
+
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
|
|
276
|
+
|
|
277
|
+
/**
|
|
278
|
+
* This method serializes the sketch as a vector of bytes.
|
|
279
|
+
* An optional header can be reserved in front of the sketch.
|
|
280
|
+
* It is an uninitialized space of a given size.
|
|
281
|
+
* This header is used in Datasketches PostgreSQL extension.
|
|
282
|
+
* @param header_size_bytes space to reserve in front of the sketch
|
|
283
|
+
*/
|
|
284
|
+
vector_bytes serialize(unsigned header_size_bytes = 0) const;
|
|
285
|
+
|
|
286
|
+
/**
|
|
287
|
+
* This method deserializes a sketch from a given stream.
|
|
288
|
+
* @param is input stream
|
|
289
|
+
* @param seed the seed for the hash function that was used to create the sketch
|
|
290
|
+
* @return an instance of a sketch
|
|
291
|
+
*/
|
|
292
|
+
static count_min_sketch deserialize(std::istream& is, uint64_t seed=DEFAULT_SEED, const Allocator& allocator = Allocator());
|
|
293
|
+
|
|
294
|
+
/**
|
|
295
|
+
* This method deserializes a sketch from a given array of bytes.
|
|
296
|
+
* @param bytes pointer to the array of bytes
|
|
297
|
+
* @param size the size of the array
|
|
298
|
+
* @param seed the seed for the hash function that was used to create the sketch
|
|
299
|
+
* @return an instance of the sketch
|
|
300
|
+
*/
|
|
301
|
+
static count_min_sketch deserialize(const void* bytes, size_t size, uint64_t seed=DEFAULT_SEED, const Allocator& allocator = Allocator());
|
|
302
|
+
|
|
303
|
+
/**
|
|
304
|
+
* Returns the allocator for this sketch.
|
|
305
|
+
* @return allocator
|
|
306
|
+
*/
|
|
307
|
+
allocator_type get_allocator() const;
|
|
308
|
+
|
|
309
|
+
private:
|
|
310
|
+
Allocator _allocator;
|
|
311
|
+
uint8_t _num_hashes ;
|
|
312
|
+
uint32_t _num_buckets ;
|
|
313
|
+
std::vector<W, Allocator> _sketch_array ; // the array stored by the sketch
|
|
314
|
+
uint64_t _seed ;
|
|
315
|
+
W _total_weight ;
|
|
316
|
+
std::vector<uint64_t> hash_seeds ;
|
|
317
|
+
|
|
318
|
+
enum flags {IS_EMPTY};
|
|
319
|
+
static const uint8_t PREAMBLE_LONGS_SHORT = 2; // Empty -> need second byte for sketch parameters
|
|
320
|
+
static const uint8_t PREAMBLE_LONGS_FULL = 3; // Not empty -> need (at least) third byte for total weight.
|
|
321
|
+
static const uint8_t SERIAL_VERSION_1 = 1;
|
|
322
|
+
static const uint8_t FAMILY_ID = 18;
|
|
323
|
+
static const uint8_t NULL_8 = 0;
|
|
324
|
+
static const uint32_t NULL_32 = 0;
|
|
325
|
+
|
|
326
|
+
/**
|
|
327
|
+
* Throws an error if the header is not valid.
|
|
328
|
+
* @param preamble_longs
|
|
329
|
+
* @param serial_version
|
|
330
|
+
* @param flags_byte
|
|
331
|
+
*/
|
|
332
|
+
static void check_header_validity(uint8_t preamble_longs, uint8_t serial_version, uint8_t family_id, uint8_t flags_byte);
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
/*
|
|
338
|
+
* Obtain the hash values when inserting an item into the sketch.
|
|
339
|
+
* @param item pointer to the data item to be inserted into the sketch.
|
|
340
|
+
* @param size of the data in bytes
|
|
341
|
+
* @return vector of uint64_t which each represent the index to which `value' must update in the sketch
|
|
342
|
+
*/
|
|
343
|
+
std::vector<uint64_t> get_hashes(const void* item, size_t size) const;
|
|
344
|
+
|
|
345
|
+
};
|
|
346
|
+
|
|
347
|
+
} /* namespace datasketches */
|
|
348
|
+
|
|
349
|
+
#include "count_min_impl.hpp"
|
|
350
|
+
|
|
351
|
+
#endif
|