datasketches 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/datasketches/cpc_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
- data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
- data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
- data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
- data/vendor/datasketches-cpp/python/README.md +5 -5
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
- data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
- data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
- data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
- data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
- data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
- data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +31 -3
|
@@ -0,0 +1,517 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef COUNT_MIN_IMPL_HPP_
|
|
21
|
+
#define COUNT_MIN_IMPL_HPP_
|
|
22
|
+
|
|
23
|
+
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iomanip>
#include <limits>
#include <random>
#include <sstream>
#include <stdexcept>

#include "MurmurHash3.h"
#include "count_min.hpp"
#include "memory_operations.hpp"
|
|
31
|
+
|
|
32
|
+
namespace datasketches {
|
|
33
|
+
|
|
34
|
+
template<typename W, typename A>
|
|
35
|
+
count_min_sketch<W,A>::count_min_sketch(uint8_t num_hashes, uint32_t num_buckets, uint64_t seed, const A& allocator):
|
|
36
|
+
_allocator(allocator),
|
|
37
|
+
_num_hashes(num_hashes),
|
|
38
|
+
_num_buckets(num_buckets),
|
|
39
|
+
_sketch_array((num_hashes*num_buckets < 1<<30) ? num_hashes*num_buckets : 0, 0, _allocator),
|
|
40
|
+
_seed(seed),
|
|
41
|
+
_total_weight(0){
|
|
42
|
+
if(num_buckets < 3) throw std::invalid_argument("Using fewer than 3 buckets incurs relative error greater than 1.") ;
|
|
43
|
+
|
|
44
|
+
// This check is to ensure later compatibility with a Java implementation whose maximum size can only
|
|
45
|
+
// be 2^31-1. We check only against 2^30 for simplicity.
|
|
46
|
+
if(num_buckets*num_hashes >= 1<<30) {
|
|
47
|
+
throw std::invalid_argument("These parameters generate a sketch that exceeds 2^30 elements."
|
|
48
|
+
"Try reducing either the number of buckets or the number of hash functions.") ;
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
std::default_random_engine rng(_seed);
|
|
52
|
+
std::uniform_int_distribution<uint64_t> extra_hash_seeds(0, std::numeric_limits<uint64_t>::max());
|
|
53
|
+
hash_seeds.reserve(num_hashes) ;
|
|
54
|
+
|
|
55
|
+
for(uint64_t i=0 ; i < num_hashes ; ++i){
|
|
56
|
+
hash_seeds.push_back(extra_hash_seeds(rng) + _seed); // Adds the global seed to all hash functions.
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
template<typename W, typename A>
|
|
61
|
+
uint8_t count_min_sketch<W,A>::get_num_hashes() const{
|
|
62
|
+
return _num_hashes ;
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
template<typename W, typename A>
|
|
66
|
+
uint32_t count_min_sketch<W,A>::get_num_buckets() const{
|
|
67
|
+
return _num_buckets ;
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
template<typename W, typename A>
|
|
71
|
+
uint64_t count_min_sketch<W,A>::get_seed() const {
|
|
72
|
+
return _seed ;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
template<typename W, typename A>
|
|
76
|
+
double count_min_sketch<W,A>::get_relative_error() const{
|
|
77
|
+
return exp(1.0) / double(_num_buckets) ;
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
template<typename W, typename A>
|
|
81
|
+
W count_min_sketch<W,A>::get_total_weight() const{
|
|
82
|
+
return _total_weight ;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
template<typename W, typename A>
|
|
86
|
+
uint32_t count_min_sketch<W,A>::suggest_num_buckets(double relative_error){
|
|
87
|
+
/*
|
|
88
|
+
* Function to help users select a number of buckets for a given error.
|
|
89
|
+
* TODO: Change this when we use only power of 2 buckets.
|
|
90
|
+
*
|
|
91
|
+
*/
|
|
92
|
+
if(relative_error < 0.){
|
|
93
|
+
throw std::invalid_argument( "Relative error must be at least 0." );
|
|
94
|
+
}
|
|
95
|
+
return ceil(exp(1.0) / relative_error);
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
template<typename W, typename A>
|
|
99
|
+
uint8_t count_min_sketch<W,A>::suggest_num_hashes(double confidence){
|
|
100
|
+
/*
|
|
101
|
+
* Function to help users select a number of hashes for a given confidence
|
|
102
|
+
* e.g. confidence = 1 - failure probability
|
|
103
|
+
* failure probability == delta in the literature.
|
|
104
|
+
*/
|
|
105
|
+
if(confidence < 0. || confidence > 1.0){
|
|
106
|
+
throw std::invalid_argument( "Confidence must be between 0 and 1.0 (inclusive)." );
|
|
107
|
+
}
|
|
108
|
+
return std::min<uint8_t>( ceil(log(1.0/(1.0 - confidence))), UINT8_MAX) ;
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
template<typename W, typename A>
|
|
112
|
+
std::vector<uint64_t> count_min_sketch<W,A>::get_hashes(const void* item, size_t size) const{
|
|
113
|
+
/*
|
|
114
|
+
* Returns the hash locations for the input item using the original hashing
|
|
115
|
+
* scheme from [1].
|
|
116
|
+
* Generate _num_hashes separate hashes from calls to murmurmhash.
|
|
117
|
+
* This could be optimized by keeping both of the 64bit parts of the hash
|
|
118
|
+
* function, rather than generating a new one for every level.
|
|
119
|
+
*
|
|
120
|
+
*
|
|
121
|
+
* Postscript.
|
|
122
|
+
* Note that a tradeoff can be achieved over the update time and space
|
|
123
|
+
* complexity of the sketch by using a combinatorial hashing scheme from
|
|
124
|
+
* https://github.com/Claudenw/BloomFilter/wiki/Bloom-Filters----An-overview
|
|
125
|
+
* https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
|
|
126
|
+
*/
|
|
127
|
+
uint64_t bucket_index ;
|
|
128
|
+
std::vector<uint64_t> sketch_update_locations; //(_num_hashes) ;
|
|
129
|
+
sketch_update_locations.reserve(_num_hashes) ;
|
|
130
|
+
|
|
131
|
+
uint64_t hash_seed_index = 0 ;
|
|
132
|
+
for(const auto &it : hash_seeds){
|
|
133
|
+
HashState hashes;
|
|
134
|
+
MurmurHash3_x64_128(item, size, it, hashes); // ? BEWARE OVERFLOW.
|
|
135
|
+
uint64_t hash = hashes.h1 ;
|
|
136
|
+
bucket_index = hash % _num_buckets ;
|
|
137
|
+
sketch_update_locations.push_back((hash_seed_index * _num_buckets) + bucket_index) ;
|
|
138
|
+
hash_seed_index += 1 ;
|
|
139
|
+
}
|
|
140
|
+
return sketch_update_locations ;
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
template<typename W, typename A>
|
|
144
|
+
W count_min_sketch<W,A>::get_estimate(uint64_t item) const {return get_estimate(&item, sizeof(item));}
|
|
145
|
+
|
|
146
|
+
template<typename W, typename A>
|
|
147
|
+
W count_min_sketch<W,A>::get_estimate(int64_t item) const {return get_estimate(&item, sizeof(item));}
|
|
148
|
+
|
|
149
|
+
template<typename W, typename A>
|
|
150
|
+
W count_min_sketch<W,A>::get_estimate(const std::string& item) const {
|
|
151
|
+
if (item.empty()) return 0 ; // Empty strings are not inserted into the sketch.
|
|
152
|
+
return get_estimate(item.c_str(), item.length());
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
template<typename W, typename A>
|
|
156
|
+
W count_min_sketch<W,A>::get_estimate(const void* item, size_t size) const {
|
|
157
|
+
/*
|
|
158
|
+
* Returns the estimated frequency of the item
|
|
159
|
+
*/
|
|
160
|
+
std::vector<uint64_t> hash_locations = get_hashes(item, size) ;
|
|
161
|
+
std::vector<W> estimates ;
|
|
162
|
+
for (auto h: hash_locations){
|
|
163
|
+
estimates.push_back(_sketch_array[h]) ;
|
|
164
|
+
}
|
|
165
|
+
W result = *std::min_element(estimates.begin(), estimates.end());
|
|
166
|
+
return result ;
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
template<typename W, typename A>
|
|
170
|
+
void count_min_sketch<W,A>::update(uint64_t item, W weight) {
|
|
171
|
+
update(&item, sizeof(item), weight);
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
template<typename W, typename A>
|
|
175
|
+
void count_min_sketch<W,A>::update(uint64_t item) {
|
|
176
|
+
update(&item, sizeof(item), 1);
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
template<typename W, typename A>
|
|
180
|
+
void count_min_sketch<W,A>::update(int64_t item, W weight) {
|
|
181
|
+
update(&item, sizeof(item), weight);
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
template<typename W, typename A>
|
|
185
|
+
void count_min_sketch<W,A>::update(int64_t item) {
|
|
186
|
+
update(&item, sizeof(item), 1);
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
template<typename W, typename A>
|
|
190
|
+
void count_min_sketch<W,A>::update(const std::string& item, W weight) {
|
|
191
|
+
if (item.empty()) return;
|
|
192
|
+
update(item.c_str(), item.length(), weight);
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
template<typename W, typename A>
|
|
196
|
+
void count_min_sketch<W,A>::update(const std::string& item) {
|
|
197
|
+
if (item.empty()) return;
|
|
198
|
+
update(item.c_str(), item.length(), 1);
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
template<typename W, typename A>
|
|
202
|
+
void count_min_sketch<W,A>::update(const void* item, size_t size, W weight) {
|
|
203
|
+
/*
|
|
204
|
+
* Gets the item's hash locations and then increments the sketch in those
|
|
205
|
+
* locations by the weight.
|
|
206
|
+
*/
|
|
207
|
+
W magnitude = (weight >= 0) ? weight : -weight ;
|
|
208
|
+
_total_weight += magnitude ;
|
|
209
|
+
std::vector<uint64_t> hash_locations = get_hashes(item, size) ;
|
|
210
|
+
for (auto h: hash_locations){
|
|
211
|
+
_sketch_array[h] += weight ;
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
template<typename W, typename A>
|
|
216
|
+
W count_min_sketch<W,A>::get_upper_bound(uint64_t item) const {return get_upper_bound(&item, sizeof(item));}
|
|
217
|
+
|
|
218
|
+
template<typename W, typename A>
|
|
219
|
+
W count_min_sketch<W,A>::get_upper_bound(int64_t item) const {return get_upper_bound(&item, sizeof(item));}
|
|
220
|
+
|
|
221
|
+
template<typename W, typename A>
|
|
222
|
+
W count_min_sketch<W,A>::get_upper_bound(const std::string& item) const {
|
|
223
|
+
if (item.empty()) return 0 ; // Empty strings are not inserted into the sketch.
|
|
224
|
+
return get_upper_bound(item.c_str(), item.length());
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
template<typename W, typename A>
|
|
228
|
+
W count_min_sketch<W,A>::get_upper_bound(const void* item, size_t size) const {
|
|
229
|
+
return get_estimate(item, size) + get_relative_error()*get_total_weight() ;
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
template<typename W, typename A>
|
|
233
|
+
W count_min_sketch<W,A>::get_lower_bound(uint64_t item) const {return get_lower_bound(&item, sizeof(item));}
|
|
234
|
+
|
|
235
|
+
template<typename W, typename A>
|
|
236
|
+
W count_min_sketch<W,A>::get_lower_bound(int64_t item) const {return get_lower_bound(&item, sizeof(item));}
|
|
237
|
+
|
|
238
|
+
template<typename W, typename A>
|
|
239
|
+
W count_min_sketch<W,A>::get_lower_bound(const std::string& item) const {
|
|
240
|
+
if (item.empty()) return 0 ; // Empty strings are not inserted into the sketch.
|
|
241
|
+
return get_lower_bound(item.c_str(), item.length());
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
template<typename W, typename A>
|
|
245
|
+
W count_min_sketch<W,A>::get_lower_bound(const void* item, size_t size) const {
|
|
246
|
+
return get_estimate(item, size) ;
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
template<typename W, typename A>
|
|
250
|
+
void count_min_sketch<W,A>::merge(const count_min_sketch &other_sketch){
|
|
251
|
+
/*
|
|
252
|
+
* Merges this sketch into other_sketch sketch by elementwise summing of buckets
|
|
253
|
+
*/
|
|
254
|
+
if(this == &other_sketch){
|
|
255
|
+
throw std::invalid_argument( "Cannot merge a sketch with itself." );
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
bool acceptable_config =
|
|
259
|
+
(get_num_hashes() == other_sketch.get_num_hashes()) &&
|
|
260
|
+
(get_num_buckets() == other_sketch.get_num_buckets()) &&
|
|
261
|
+
(get_seed() == other_sketch.get_seed()) ;
|
|
262
|
+
if(!acceptable_config){
|
|
263
|
+
throw std::invalid_argument( "Incompatible sketch configuration." );
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
// Merge step - iterate over the other vector and add the weights to this sketch
|
|
267
|
+
auto it = _sketch_array.begin() ; // This is a std::vector iterator.
|
|
268
|
+
auto other_it = other_sketch.begin() ; //This is a const iterator over the other sketch.
|
|
269
|
+
while(it != _sketch_array.end()){
|
|
270
|
+
*it += *other_it ;
|
|
271
|
+
++it ;
|
|
272
|
+
++other_it ;
|
|
273
|
+
}
|
|
274
|
+
_total_weight += other_sketch.get_total_weight() ;
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
// Iterators
|
|
278
|
+
template<typename W, typename A>
|
|
279
|
+
typename count_min_sketch<W,A>::const_iterator count_min_sketch<W,A>::begin() const {
|
|
280
|
+
return _sketch_array.begin();
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
template<typename W, typename A>
|
|
284
|
+
typename count_min_sketch<W,A>::const_iterator count_min_sketch<W,A>::end() const {
|
|
285
|
+
return _sketch_array.end();
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
template<typename W, typename A>
|
|
289
|
+
void count_min_sketch<W,A>::serialize(std::ostream& os) const {
|
|
290
|
+
// Long 0
|
|
291
|
+
//const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_SHORT : PREAMBLE_LONGS_FULL;
|
|
292
|
+
const uint8_t preamble_longs = PREAMBLE_LONGS_SHORT;
|
|
293
|
+
const uint8_t ser_ver = SERIAL_VERSION_1;
|
|
294
|
+
const uint8_t family_id = FAMILY_ID ;
|
|
295
|
+
const uint8_t flags_byte = (is_empty() ? 1 << flags::IS_EMPTY : 0);
|
|
296
|
+
const uint32_t unused32 = NULL_32 ;
|
|
297
|
+
write(os, preamble_longs) ;
|
|
298
|
+
write(os, ser_ver) ;
|
|
299
|
+
write(os, family_id) ;
|
|
300
|
+
write(os, flags_byte) ;
|
|
301
|
+
write(os, unused32) ;
|
|
302
|
+
|
|
303
|
+
// Long 1
|
|
304
|
+
const uint32_t nbuckets = _num_buckets ;
|
|
305
|
+
const uint8_t nhashes = _num_hashes ;
|
|
306
|
+
const uint16_t seed_hash(compute_seed_hash(_seed));
|
|
307
|
+
const uint8_t unused8 = NULL_8;
|
|
308
|
+
write(os, nbuckets) ;
|
|
309
|
+
write(os, nhashes) ;
|
|
310
|
+
write(os, seed_hash) ;
|
|
311
|
+
write(os, unused8) ;
|
|
312
|
+
if (is_empty()) return ; // sketch is empty, no need to write further bytes.
|
|
313
|
+
|
|
314
|
+
// Long 2
|
|
315
|
+
const W t_weight = _total_weight ;
|
|
316
|
+
write(os, t_weight) ;
|
|
317
|
+
|
|
318
|
+
// Long 3 onwards: remaining bytes are consumed by writing the weight and the array values.
|
|
319
|
+
auto it = _sketch_array.begin() ;
|
|
320
|
+
while(it != _sketch_array.end()){
|
|
321
|
+
write(os, *it) ;
|
|
322
|
+
++it ;
|
|
323
|
+
}
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
template<typename W, typename A>
|
|
327
|
+
auto count_min_sketch<W,A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) -> count_min_sketch {
|
|
328
|
+
|
|
329
|
+
// First 8 bytes are 4 bytes of preamble and 4 unused bytes.
|
|
330
|
+
const auto preamble_longs = read<uint8_t>(is) ;
|
|
331
|
+
const auto serial_version = read<uint8_t>(is) ;
|
|
332
|
+
const auto family_id = read<uint8_t>(is) ;
|
|
333
|
+
const auto flags_byte = read<uint8_t>(is) ;
|
|
334
|
+
read<uint32_t>(is) ; // 4 unused bytes
|
|
335
|
+
|
|
336
|
+
check_header_validity(preamble_longs, serial_version, family_id, flags_byte);
|
|
337
|
+
|
|
338
|
+
// Sketch parameters
|
|
339
|
+
const auto nbuckets = read<uint32_t>(is) ;
|
|
340
|
+
const auto nhashes = read<uint8_t>(is);
|
|
341
|
+
const auto seed_hash = read<uint16_t>(is) ;
|
|
342
|
+
read<uint8_t>(is) ; // 1 unused byte
|
|
343
|
+
|
|
344
|
+
if (seed_hash != compute_seed_hash(seed)) {
|
|
345
|
+
throw std::invalid_argument("Incompatible seed hashes: " + std::to_string(seed_hash) + ", "
|
|
346
|
+
+ std::to_string(compute_seed_hash(seed)));
|
|
347
|
+
}
|
|
348
|
+
count_min_sketch c(nhashes, nbuckets, seed, allocator) ;
|
|
349
|
+
const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
|
|
350
|
+
if (is_empty == 1) return c ; // sketch is empty, no need to read further.
|
|
351
|
+
|
|
352
|
+
// Set the sketch weight and read in the sketch values
|
|
353
|
+
const auto weight = read<W>(is) ;
|
|
354
|
+
c._total_weight += weight ;
|
|
355
|
+
read(is, c._sketch_array.data(), sizeof(W) * c._sketch_array.size());
|
|
356
|
+
|
|
357
|
+
return c ;
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
template<typename W, typename A>
|
|
361
|
+
size_t count_min_sketch<W,A>::get_serialized_size_bytes() const {
|
|
362
|
+
// The header is always 2 longs, whether empty or full
|
|
363
|
+
size_t preamble_longs = PREAMBLE_LONGS_SHORT;
|
|
364
|
+
|
|
365
|
+
// If the sketch is empty, we're done. Otherwise, we need the total weight
|
|
366
|
+
// held by the sketch as well as a data table of size (num_buckets * num_hashes)
|
|
367
|
+
return (preamble_longs * sizeof(uint64_t)) + (is_empty() ? 0 : sizeof(W) * (1 + _num_buckets * _num_hashes));
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
template<typename W, typename A>
|
|
371
|
+
auto count_min_sketch<W,A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
|
|
372
|
+
vector_bytes bytes(header_size_bytes + get_serialized_size_bytes(), 0, _allocator);
|
|
373
|
+
uint8_t *ptr = bytes.data() + header_size_bytes;
|
|
374
|
+
|
|
375
|
+
// Long 0
|
|
376
|
+
const uint8_t preamble_longs = PREAMBLE_LONGS_SHORT;
|
|
377
|
+
ptr += copy_to_mem(preamble_longs, ptr);
|
|
378
|
+
const uint8_t ser_ver = SERIAL_VERSION_1;
|
|
379
|
+
ptr += copy_to_mem(ser_ver, ptr);
|
|
380
|
+
const uint8_t family_id = FAMILY_ID ;
|
|
381
|
+
ptr += copy_to_mem(family_id, ptr);
|
|
382
|
+
const uint8_t flags_byte = (is_empty() ? 1 << flags::IS_EMPTY : 0);
|
|
383
|
+
ptr += copy_to_mem(flags_byte, ptr);
|
|
384
|
+
const uint32_t unused32 = NULL_32 ;
|
|
385
|
+
ptr += copy_to_mem(unused32, ptr) ;
|
|
386
|
+
|
|
387
|
+
// Long 1
|
|
388
|
+
const uint32_t nbuckets = _num_buckets ;
|
|
389
|
+
const uint8_t nhashes = _num_hashes ;
|
|
390
|
+
const uint16_t seed_hash(compute_seed_hash(_seed));
|
|
391
|
+
const uint8_t null_characters_8 = NULL_8;
|
|
392
|
+
ptr += copy_to_mem(nbuckets, ptr) ;
|
|
393
|
+
ptr += copy_to_mem(nhashes, ptr) ;
|
|
394
|
+
ptr += copy_to_mem(seed_hash, ptr) ;
|
|
395
|
+
ptr += copy_to_mem(null_characters_8, ptr) ;
|
|
396
|
+
if (is_empty()) return bytes ; // sketch is empty, no need to write further bytes.
|
|
397
|
+
|
|
398
|
+
// Long 2
|
|
399
|
+
const W t_weight = _total_weight ;
|
|
400
|
+
ptr += copy_to_mem(t_weight, ptr) ;
|
|
401
|
+
|
|
402
|
+
// Long 3 onwards: remaining bytes are consumed by writing the weight and the array values.
|
|
403
|
+
auto it = _sketch_array.begin() ;
|
|
404
|
+
while(it != _sketch_array.end()){
|
|
405
|
+
ptr += copy_to_mem(*it, ptr) ;
|
|
406
|
+
++it ;
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
return bytes;
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
template<typename W, typename A>
|
|
413
|
+
auto count_min_sketch<W,A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) -> count_min_sketch {
|
|
414
|
+
ensure_minimum_memory(size, PREAMBLE_LONGS_SHORT * sizeof(uint64_t));
|
|
415
|
+
|
|
416
|
+
const char* ptr = static_cast<const char*>(bytes);
|
|
417
|
+
|
|
418
|
+
// First 8 bytes are 4 bytes of preamble and 4 unused bytes.
|
|
419
|
+
uint8_t preamble_longs ;
|
|
420
|
+
ptr += copy_from_mem(ptr, preamble_longs) ;
|
|
421
|
+
uint8_t serial_version ;
|
|
422
|
+
ptr += copy_from_mem(ptr, serial_version) ;
|
|
423
|
+
uint8_t family_id ;
|
|
424
|
+
ptr += copy_from_mem(ptr, family_id) ;
|
|
425
|
+
uint8_t flags_byte ;
|
|
426
|
+
ptr += copy_from_mem(ptr, flags_byte) ;
|
|
427
|
+
ptr += sizeof(uint32_t);
|
|
428
|
+
|
|
429
|
+
check_header_validity(preamble_longs, serial_version, family_id, flags_byte);
|
|
430
|
+
|
|
431
|
+
// Second 8 bytes are the sketch parameters with a final, unused byte.
|
|
432
|
+
uint32_t nbuckets ;
|
|
433
|
+
uint8_t nhashes ;
|
|
434
|
+
uint16_t seed_hash ;
|
|
435
|
+
ptr += copy_from_mem(ptr, nbuckets) ;
|
|
436
|
+
ptr += copy_from_mem(ptr, nhashes) ;
|
|
437
|
+
ptr += copy_from_mem(ptr, seed_hash) ;
|
|
438
|
+
ptr += sizeof(uint8_t);
|
|
439
|
+
|
|
440
|
+
if (seed_hash != compute_seed_hash(seed)) {
|
|
441
|
+
throw std::invalid_argument("Incompatible seed hashes: " + std::to_string(seed_hash) + ", "
|
|
442
|
+
+ std::to_string(compute_seed_hash(seed)));
|
|
443
|
+
}
|
|
444
|
+
count_min_sketch c(nhashes, nbuckets, seed, allocator) ;
|
|
445
|
+
const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
|
|
446
|
+
if (is_empty) return c ; // sketch is empty, no need to read further.
|
|
447
|
+
|
|
448
|
+
ensure_minimum_memory(size, sizeof(W) * (1 + nbuckets * nhashes));
|
|
449
|
+
|
|
450
|
+
// Long 2 is the weight.
|
|
451
|
+
W weight;
|
|
452
|
+
ptr += copy_from_mem(ptr, weight) ;
|
|
453
|
+
c._total_weight += weight ;
|
|
454
|
+
|
|
455
|
+
// All remaining bytes are the sketch table entries.
|
|
456
|
+
for (size_t i = 0; i<c._num_buckets*c._num_hashes ; ++i){
|
|
457
|
+
ptr += copy_from_mem(ptr, c._sketch_array[i]) ;
|
|
458
|
+
}
|
|
459
|
+
return c;
|
|
460
|
+
}
|
|
461
|
+
|
|
462
|
+
template<typename W, typename A>
|
|
463
|
+
bool count_min_sketch<W,A>::is_empty() const {
|
|
464
|
+
return _total_weight == 0;
|
|
465
|
+
}
|
|
466
|
+
|
|
467
|
+
template<typename W, typename A>
|
|
468
|
+
string<A> count_min_sketch<W,A>::to_string() const {
|
|
469
|
+
// count the number of used entries in the sketch
|
|
470
|
+
uint64_t num_nonzero = 0;
|
|
471
|
+
for (auto entry : _sketch_array) {
|
|
472
|
+
if (entry != static_cast<W>(0.0))
|
|
473
|
+
++num_nonzero;
|
|
474
|
+
}
|
|
475
|
+
|
|
476
|
+
// Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
|
|
477
|
+
// The stream does not support passing an allocator instance, and alternatives are complicated.
|
|
478
|
+
std::ostringstream os;
|
|
479
|
+
os << "### Count Min sketch summary:" << std::endl;
|
|
480
|
+
os << " num hashes : " << static_cast<uint32_t>(_num_hashes) << std::endl;
|
|
481
|
+
os << " num buckets : " << _num_buckets << std::endl;
|
|
482
|
+
os << " capacity bins : " << _sketch_array.size() << std::endl;
|
|
483
|
+
os << " filled bins : " << num_nonzero << std::endl;
|
|
484
|
+
os << " pct filled : " << std::setprecision(3) << (num_nonzero * 100.0) / _sketch_array.size() << "%" << std::endl;
|
|
485
|
+
os << "### End sketch summary" << std::endl;
|
|
486
|
+
|
|
487
|
+
return string<A>(os.str().c_str(), _allocator);
|
|
488
|
+
}
|
|
489
|
+
|
|
490
|
+
template<typename W, typename A>
|
|
491
|
+
void count_min_sketch<W,A>::check_header_validity(uint8_t preamble_longs, uint8_t serial_version, uint8_t family_id, uint8_t flags_byte) {
|
|
492
|
+
const bool empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
|
|
493
|
+
|
|
494
|
+
const uint8_t sw = (empty ? 1 : 0) + (2 * serial_version) + (4 * family_id) + (32 * (preamble_longs & 0x3F));
|
|
495
|
+
bool valid = true;
|
|
496
|
+
|
|
497
|
+
switch (sw) { // exhaustive list and description of all valid cases
|
|
498
|
+
case 138 : break; // !empty, ser_ver==1, family==18, preLongs=2;
|
|
499
|
+
case 139 : break; // empty, ser_ver==1, family==18, preLongs=2;
|
|
500
|
+
//case 170 : break ; // !empty, ser_ver==1, family==18, preLongs=3 ;
|
|
501
|
+
default : // all other case values are invalid
|
|
502
|
+
valid = false;
|
|
503
|
+
}
|
|
504
|
+
|
|
505
|
+
if (!valid) {
|
|
506
|
+
std::ostringstream os;
|
|
507
|
+
os << "Possible sketch corruption. Inconsistent state: "
|
|
508
|
+
<< "preamble_longs = " << static_cast<uint32_t>(preamble_longs)
|
|
509
|
+
<< ", empty = " << (empty ? "true" : "false")
|
|
510
|
+
<< ", serialization_version = " << static_cast<uint32_t>(serial_version) ;
|
|
511
|
+
throw std::invalid_argument(os.str());
|
|
512
|
+
}
|
|
513
|
+
}
|
|
514
|
+
|
|
515
|
+
} /* namespace datasketches */
|
|
516
|
+
|
|
517
|
+
#endif
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
18
|
+
# Unit-test executable for the count-min sketch; its sources are attached at the end of this file.
add_executable(count_min_test)

# A test executable has no consumers, so its link dependencies are PRIVATE.
target_link_libraries(count_min_test PRIVATE count common_test_lib)

set_target_properties(count_min_test PROPERTIES
  CXX_STANDARD 11
  CXX_STANDARD_REQUIRED YES
)

# Hand the tests the source directory (with a trailing slash) as the location of their input files.
file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" COUNT_TEST_BINARY_PATH)
string(APPEND COUNT_TEST_BINARY_PATH "/")
target_compile_definitions(count_min_test
  PRIVATE
    TEST_BINARY_INPUT_PATH="${COUNT_TEST_BINARY_PATH}"
)

# Register the executable with CTest.
add_test(
  NAME count_min_test
  COMMAND count_min_test
)

target_sources(count_min_test
  PRIVATE
    count_min_test.cpp
    count_min_allocation_test.cpp
)
|