datasketches 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/NOTICE +1 -1
  7. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  9. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  10. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  11. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  12. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  13. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  14. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  15. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  16. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  18. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  73. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  74. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  75. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  76. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  78. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  79. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  80. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  81. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  82. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  83. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  84. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  86. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  87. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  88. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  89. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  96. data/vendor/datasketches-cpp/setup.py +1 -1
  97. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  98. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  99. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  102. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  104. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  105. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  107. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  108. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  109. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  110. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  111. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  112. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  113. metadata +31 -3
@@ -0,0 +1,517 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef COUNT_MIN_IMPL_HPP_
21
+ #define COUNT_MIN_IMPL_HPP_
22
+
23
+ #include <algorithm>
24
+ #include <iomanip>
25
+ #include <random>
26
+ #include <sstream>
27
+
28
+ #include "MurmurHash3.h"
29
+ #include "count_min.hpp"
30
+ #include "memory_operations.hpp"
31
+
32
+ namespace datasketches {
33
+
34
+ template<typename W, typename A>
35
+ count_min_sketch<W,A>::count_min_sketch(uint8_t num_hashes, uint32_t num_buckets, uint64_t seed, const A& allocator):
36
+ _allocator(allocator),
37
+ _num_hashes(num_hashes),
38
+ _num_buckets(num_buckets),
39
+ _sketch_array((num_hashes*num_buckets < 1<<30) ? num_hashes*num_buckets : 0, 0, _allocator),
40
+ _seed(seed),
41
+ _total_weight(0){
42
+ if(num_buckets < 3) throw std::invalid_argument("Using fewer than 3 buckets incurs relative error greater than 1.") ;
43
+
44
+ // This check is to ensure later compatibility with a Java implementation whose maximum size can only
45
+ // be 2^31-1. We check only against 2^30 for simplicity.
46
+ if(num_buckets*num_hashes >= 1<<30) {
47
+ throw std::invalid_argument("These parameters generate a sketch that exceeds 2^30 elements."
48
+ "Try reducing either the number of buckets or the number of hash functions.") ;
49
+ }
50
+
51
+ std::default_random_engine rng(_seed);
52
+ std::uniform_int_distribution<uint64_t> extra_hash_seeds(0, std::numeric_limits<uint64_t>::max());
53
+ hash_seeds.reserve(num_hashes) ;
54
+
55
+ for(uint64_t i=0 ; i < num_hashes ; ++i){
56
+ hash_seeds.push_back(extra_hash_seeds(rng) + _seed); // Adds the global seed to all hash functions.
57
+ }
58
+ }
59
+
60
+ template<typename W, typename A>
61
+ uint8_t count_min_sketch<W,A>::get_num_hashes() const{
62
+ return _num_hashes ;
63
+ }
64
+
65
+ template<typename W, typename A>
66
+ uint32_t count_min_sketch<W,A>::get_num_buckets() const{
67
+ return _num_buckets ;
68
+ }
69
+
70
+ template<typename W, typename A>
71
+ uint64_t count_min_sketch<W,A>::get_seed() const {
72
+ return _seed ;
73
+ }
74
+
75
+ template<typename W, typename A>
76
+ double count_min_sketch<W,A>::get_relative_error() const{
77
+ return exp(1.0) / double(_num_buckets) ;
78
+ }
79
+
80
+ template<typename W, typename A>
81
+ W count_min_sketch<W,A>::get_total_weight() const{
82
+ return _total_weight ;
83
+ }
84
+
85
+ template<typename W, typename A>
86
+ uint32_t count_min_sketch<W,A>::suggest_num_buckets(double relative_error){
87
+ /*
88
+ * Function to help users select a number of buckets for a given error.
89
+ * TODO: Change this when we use only power of 2 buckets.
90
+ *
91
+ */
92
+ if(relative_error < 0.){
93
+ throw std::invalid_argument( "Relative error must be at least 0." );
94
+ }
95
+ return ceil(exp(1.0) / relative_error);
96
+ }
97
+
98
+ template<typename W, typename A>
99
+ uint8_t count_min_sketch<W,A>::suggest_num_hashes(double confidence){
100
+ /*
101
+ * Function to help users select a number of hashes for a given confidence
102
+ * e.g. confidence = 1 - failure probability
103
+ * failure probability == delta in the literature.
104
+ */
105
+ if(confidence < 0. || confidence > 1.0){
106
+ throw std::invalid_argument( "Confidence must be between 0 and 1.0 (inclusive)." );
107
+ }
108
+ return std::min<uint8_t>( ceil(log(1.0/(1.0 - confidence))), UINT8_MAX) ;
109
+ }
110
+
111
+ template<typename W, typename A>
112
+ std::vector<uint64_t> count_min_sketch<W,A>::get_hashes(const void* item, size_t size) const{
113
+ /*
114
+ * Returns the hash locations for the input item using the original hashing
115
+ * scheme from [1].
116
+ * Generate _num_hashes separate hashes from calls to murmurmhash.
117
+ * This could be optimized by keeping both of the 64bit parts of the hash
118
+ * function, rather than generating a new one for every level.
119
+ *
120
+ *
121
+ * Postscript.
122
+ * Note that a tradeoff can be achieved over the update time and space
123
+ * complexity of the sketch by using a combinatorial hashing scheme from
124
+ * https://github.com/Claudenw/BloomFilter/wiki/Bloom-Filters----An-overview
125
+ * https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
126
+ */
127
+ uint64_t bucket_index ;
128
+ std::vector<uint64_t> sketch_update_locations; //(_num_hashes) ;
129
+ sketch_update_locations.reserve(_num_hashes) ;
130
+
131
+ uint64_t hash_seed_index = 0 ;
132
+ for(const auto &it : hash_seeds){
133
+ HashState hashes;
134
+ MurmurHash3_x64_128(item, size, it, hashes); // ? BEWARE OVERFLOW.
135
+ uint64_t hash = hashes.h1 ;
136
+ bucket_index = hash % _num_buckets ;
137
+ sketch_update_locations.push_back((hash_seed_index * _num_buckets) + bucket_index) ;
138
+ hash_seed_index += 1 ;
139
+ }
140
+ return sketch_update_locations ;
141
+ }
142
+
143
+ template<typename W, typename A>
144
+ W count_min_sketch<W,A>::get_estimate(uint64_t item) const {return get_estimate(&item, sizeof(item));}
145
+
146
+ template<typename W, typename A>
147
+ W count_min_sketch<W,A>::get_estimate(int64_t item) const {return get_estimate(&item, sizeof(item));}
148
+
149
+ template<typename W, typename A>
150
+ W count_min_sketch<W,A>::get_estimate(const std::string& item) const {
151
+ if (item.empty()) return 0 ; // Empty strings are not inserted into the sketch.
152
+ return get_estimate(item.c_str(), item.length());
153
+ }
154
+
155
+ template<typename W, typename A>
156
+ W count_min_sketch<W,A>::get_estimate(const void* item, size_t size) const {
157
+ /*
158
+ * Returns the estimated frequency of the item
159
+ */
160
+ std::vector<uint64_t> hash_locations = get_hashes(item, size) ;
161
+ std::vector<W> estimates ;
162
+ for (auto h: hash_locations){
163
+ estimates.push_back(_sketch_array[h]) ;
164
+ }
165
+ W result = *std::min_element(estimates.begin(), estimates.end());
166
+ return result ;
167
+ }
168
+
169
+ template<typename W, typename A>
170
+ void count_min_sketch<W,A>::update(uint64_t item, W weight) {
171
+ update(&item, sizeof(item), weight);
172
+ }
173
+
174
+ template<typename W, typename A>
175
+ void count_min_sketch<W,A>::update(uint64_t item) {
176
+ update(&item, sizeof(item), 1);
177
+ }
178
+
179
+ template<typename W, typename A>
180
+ void count_min_sketch<W,A>::update(int64_t item, W weight) {
181
+ update(&item, sizeof(item), weight);
182
+ }
183
+
184
+ template<typename W, typename A>
185
+ void count_min_sketch<W,A>::update(int64_t item) {
186
+ update(&item, sizeof(item), 1);
187
+ }
188
+
189
+ template<typename W, typename A>
190
+ void count_min_sketch<W,A>::update(const std::string& item, W weight) {
191
+ if (item.empty()) return;
192
+ update(item.c_str(), item.length(), weight);
193
+ }
194
+
195
+ template<typename W, typename A>
196
+ void count_min_sketch<W,A>::update(const std::string& item) {
197
+ if (item.empty()) return;
198
+ update(item.c_str(), item.length(), 1);
199
+ }
200
+
201
+ template<typename W, typename A>
202
+ void count_min_sketch<W,A>::update(const void* item, size_t size, W weight) {
203
+ /*
204
+ * Gets the item's hash locations and then increments the sketch in those
205
+ * locations by the weight.
206
+ */
207
+ W magnitude = (weight >= 0) ? weight : -weight ;
208
+ _total_weight += magnitude ;
209
+ std::vector<uint64_t> hash_locations = get_hashes(item, size) ;
210
+ for (auto h: hash_locations){
211
+ _sketch_array[h] += weight ;
212
+ }
213
+ }
214
+
215
+ template<typename W, typename A>
216
+ W count_min_sketch<W,A>::get_upper_bound(uint64_t item) const {return get_upper_bound(&item, sizeof(item));}
217
+
218
+ template<typename W, typename A>
219
+ W count_min_sketch<W,A>::get_upper_bound(int64_t item) const {return get_upper_bound(&item, sizeof(item));}
220
+
221
+ template<typename W, typename A>
222
+ W count_min_sketch<W,A>::get_upper_bound(const std::string& item) const {
223
+ if (item.empty()) return 0 ; // Empty strings are not inserted into the sketch.
224
+ return get_upper_bound(item.c_str(), item.length());
225
+ }
226
+
227
+ template<typename W, typename A>
228
+ W count_min_sketch<W,A>::get_upper_bound(const void* item, size_t size) const {
229
+ return get_estimate(item, size) + get_relative_error()*get_total_weight() ;
230
+ }
231
+
232
+ template<typename W, typename A>
233
+ W count_min_sketch<W,A>::get_lower_bound(uint64_t item) const {return get_lower_bound(&item, sizeof(item));}
234
+
235
+ template<typename W, typename A>
236
+ W count_min_sketch<W,A>::get_lower_bound(int64_t item) const {return get_lower_bound(&item, sizeof(item));}
237
+
238
+ template<typename W, typename A>
239
+ W count_min_sketch<W,A>::get_lower_bound(const std::string& item) const {
240
+ if (item.empty()) return 0 ; // Empty strings are not inserted into the sketch.
241
+ return get_lower_bound(item.c_str(), item.length());
242
+ }
243
+
244
+ template<typename W, typename A>
245
+ W count_min_sketch<W,A>::get_lower_bound(const void* item, size_t size) const {
246
+ return get_estimate(item, size) ;
247
+ }
248
+
249
+ template<typename W, typename A>
250
+ void count_min_sketch<W,A>::merge(const count_min_sketch &other_sketch){
251
+ /*
252
+ * Merges this sketch into other_sketch sketch by elementwise summing of buckets
253
+ */
254
+ if(this == &other_sketch){
255
+ throw std::invalid_argument( "Cannot merge a sketch with itself." );
256
+ }
257
+
258
+ bool acceptable_config =
259
+ (get_num_hashes() == other_sketch.get_num_hashes()) &&
260
+ (get_num_buckets() == other_sketch.get_num_buckets()) &&
261
+ (get_seed() == other_sketch.get_seed()) ;
262
+ if(!acceptable_config){
263
+ throw std::invalid_argument( "Incompatible sketch configuration." );
264
+ }
265
+
266
+ // Merge step - iterate over the other vector and add the weights to this sketch
267
+ auto it = _sketch_array.begin() ; // This is a std::vector iterator.
268
+ auto other_it = other_sketch.begin() ; //This is a const iterator over the other sketch.
269
+ while(it != _sketch_array.end()){
270
+ *it += *other_it ;
271
+ ++it ;
272
+ ++other_it ;
273
+ }
274
+ _total_weight += other_sketch.get_total_weight() ;
275
+ }
276
+
277
+ // Iterators
278
+ template<typename W, typename A>
279
+ typename count_min_sketch<W,A>::const_iterator count_min_sketch<W,A>::begin() const {
280
+ return _sketch_array.begin();
281
+ }
282
+
283
+ template<typename W, typename A>
284
+ typename count_min_sketch<W,A>::const_iterator count_min_sketch<W,A>::end() const {
285
+ return _sketch_array.end();
286
+ }
287
+
288
+ template<typename W, typename A>
289
+ void count_min_sketch<W,A>::serialize(std::ostream& os) const {
290
+ // Long 0
291
+ //const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_SHORT : PREAMBLE_LONGS_FULL;
292
+ const uint8_t preamble_longs = PREAMBLE_LONGS_SHORT;
293
+ const uint8_t ser_ver = SERIAL_VERSION_1;
294
+ const uint8_t family_id = FAMILY_ID ;
295
+ const uint8_t flags_byte = (is_empty() ? 1 << flags::IS_EMPTY : 0);
296
+ const uint32_t unused32 = NULL_32 ;
297
+ write(os, preamble_longs) ;
298
+ write(os, ser_ver) ;
299
+ write(os, family_id) ;
300
+ write(os, flags_byte) ;
301
+ write(os, unused32) ;
302
+
303
+ // Long 1
304
+ const uint32_t nbuckets = _num_buckets ;
305
+ const uint8_t nhashes = _num_hashes ;
306
+ const uint16_t seed_hash(compute_seed_hash(_seed));
307
+ const uint8_t unused8 = NULL_8;
308
+ write(os, nbuckets) ;
309
+ write(os, nhashes) ;
310
+ write(os, seed_hash) ;
311
+ write(os, unused8) ;
312
+ if (is_empty()) return ; // sketch is empty, no need to write further bytes.
313
+
314
+ // Long 2
315
+ const W t_weight = _total_weight ;
316
+ write(os, t_weight) ;
317
+
318
+ // Long 3 onwards: remaining bytes are consumed by writing the weight and the array values.
319
+ auto it = _sketch_array.begin() ;
320
+ while(it != _sketch_array.end()){
321
+ write(os, *it) ;
322
+ ++it ;
323
+ }
324
+ }
325
+
326
+ template<typename W, typename A>
327
+ auto count_min_sketch<W,A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) -> count_min_sketch {
328
+
329
+ // First 8 bytes are 4 bytes of preamble and 4 unused bytes.
330
+ const auto preamble_longs = read<uint8_t>(is) ;
331
+ const auto serial_version = read<uint8_t>(is) ;
332
+ const auto family_id = read<uint8_t>(is) ;
333
+ const auto flags_byte = read<uint8_t>(is) ;
334
+ read<uint32_t>(is) ; // 4 unused bytes
335
+
336
+ check_header_validity(preamble_longs, serial_version, family_id, flags_byte);
337
+
338
+ // Sketch parameters
339
+ const auto nbuckets = read<uint32_t>(is) ;
340
+ const auto nhashes = read<uint8_t>(is);
341
+ const auto seed_hash = read<uint16_t>(is) ;
342
+ read<uint8_t>(is) ; // 1 unused byte
343
+
344
+ if (seed_hash != compute_seed_hash(seed)) {
345
+ throw std::invalid_argument("Incompatible seed hashes: " + std::to_string(seed_hash) + ", "
346
+ + std::to_string(compute_seed_hash(seed)));
347
+ }
348
+ count_min_sketch c(nhashes, nbuckets, seed, allocator) ;
349
+ const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
350
+ if (is_empty == 1) return c ; // sketch is empty, no need to read further.
351
+
352
+ // Set the sketch weight and read in the sketch values
353
+ const auto weight = read<W>(is) ;
354
+ c._total_weight += weight ;
355
+ read(is, c._sketch_array.data(), sizeof(W) * c._sketch_array.size());
356
+
357
+ return c ;
358
+ }
359
+
360
+ template<typename W, typename A>
361
+ size_t count_min_sketch<W,A>::get_serialized_size_bytes() const {
362
+ // The header is always 2 longs, whether empty or full
363
+ size_t preamble_longs = PREAMBLE_LONGS_SHORT;
364
+
365
+ // If the sketch is empty, we're done. Otherwise, we need the total weight
366
+ // held by the sketch as well as a data table of size (num_buckets * num_hashes)
367
+ return (preamble_longs * sizeof(uint64_t)) + (is_empty() ? 0 : sizeof(W) * (1 + _num_buckets * _num_hashes));
368
+ }
369
+
370
+ template<typename W, typename A>
371
+ auto count_min_sketch<W,A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
372
+ vector_bytes bytes(header_size_bytes + get_serialized_size_bytes(), 0, _allocator);
373
+ uint8_t *ptr = bytes.data() + header_size_bytes;
374
+
375
+ // Long 0
376
+ const uint8_t preamble_longs = PREAMBLE_LONGS_SHORT;
377
+ ptr += copy_to_mem(preamble_longs, ptr);
378
+ const uint8_t ser_ver = SERIAL_VERSION_1;
379
+ ptr += copy_to_mem(ser_ver, ptr);
380
+ const uint8_t family_id = FAMILY_ID ;
381
+ ptr += copy_to_mem(family_id, ptr);
382
+ const uint8_t flags_byte = (is_empty() ? 1 << flags::IS_EMPTY : 0);
383
+ ptr += copy_to_mem(flags_byte, ptr);
384
+ const uint32_t unused32 = NULL_32 ;
385
+ ptr += copy_to_mem(unused32, ptr) ;
386
+
387
+ // Long 1
388
+ const uint32_t nbuckets = _num_buckets ;
389
+ const uint8_t nhashes = _num_hashes ;
390
+ const uint16_t seed_hash(compute_seed_hash(_seed));
391
+ const uint8_t null_characters_8 = NULL_8;
392
+ ptr += copy_to_mem(nbuckets, ptr) ;
393
+ ptr += copy_to_mem(nhashes, ptr) ;
394
+ ptr += copy_to_mem(seed_hash, ptr) ;
395
+ ptr += copy_to_mem(null_characters_8, ptr) ;
396
+ if (is_empty()) return bytes ; // sketch is empty, no need to write further bytes.
397
+
398
+ // Long 2
399
+ const W t_weight = _total_weight ;
400
+ ptr += copy_to_mem(t_weight, ptr) ;
401
+
402
+ // Long 3 onwards: remaining bytes are consumed by writing the weight and the array values.
403
+ auto it = _sketch_array.begin() ;
404
+ while(it != _sketch_array.end()){
405
+ ptr += copy_to_mem(*it, ptr) ;
406
+ ++it ;
407
+ }
408
+
409
+ return bytes;
410
+ }
411
+
412
+ template<typename W, typename A>
413
+ auto count_min_sketch<W,A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) -> count_min_sketch {
414
+ ensure_minimum_memory(size, PREAMBLE_LONGS_SHORT * sizeof(uint64_t));
415
+
416
+ const char* ptr = static_cast<const char*>(bytes);
417
+
418
+ // First 8 bytes are 4 bytes of preamble and 4 unused bytes.
419
+ uint8_t preamble_longs ;
420
+ ptr += copy_from_mem(ptr, preamble_longs) ;
421
+ uint8_t serial_version ;
422
+ ptr += copy_from_mem(ptr, serial_version) ;
423
+ uint8_t family_id ;
424
+ ptr += copy_from_mem(ptr, family_id) ;
425
+ uint8_t flags_byte ;
426
+ ptr += copy_from_mem(ptr, flags_byte) ;
427
+ ptr += sizeof(uint32_t);
428
+
429
+ check_header_validity(preamble_longs, serial_version, family_id, flags_byte);
430
+
431
+ // Second 8 bytes are the sketch parameters with a final, unused byte.
432
+ uint32_t nbuckets ;
433
+ uint8_t nhashes ;
434
+ uint16_t seed_hash ;
435
+ ptr += copy_from_mem(ptr, nbuckets) ;
436
+ ptr += copy_from_mem(ptr, nhashes) ;
437
+ ptr += copy_from_mem(ptr, seed_hash) ;
438
+ ptr += sizeof(uint8_t);
439
+
440
+ if (seed_hash != compute_seed_hash(seed)) {
441
+ throw std::invalid_argument("Incompatible seed hashes: " + std::to_string(seed_hash) + ", "
442
+ + std::to_string(compute_seed_hash(seed)));
443
+ }
444
+ count_min_sketch c(nhashes, nbuckets, seed, allocator) ;
445
+ const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
446
+ if (is_empty) return c ; // sketch is empty, no need to read further.
447
+
448
+ ensure_minimum_memory(size, sizeof(W) * (1 + nbuckets * nhashes));
449
+
450
+ // Long 2 is the weight.
451
+ W weight;
452
+ ptr += copy_from_mem(ptr, weight) ;
453
+ c._total_weight += weight ;
454
+
455
+ // All remaining bytes are the sketch table entries.
456
+ for (size_t i = 0; i<c._num_buckets*c._num_hashes ; ++i){
457
+ ptr += copy_from_mem(ptr, c._sketch_array[i]) ;
458
+ }
459
+ return c;
460
+ }
461
+
462
+ template<typename W, typename A>
463
+ bool count_min_sketch<W,A>::is_empty() const {
464
+ return _total_weight == 0;
465
+ }
466
+
467
+ template<typename W, typename A>
468
+ string<A> count_min_sketch<W,A>::to_string() const {
469
+ // count the number of used entries in the sketch
470
+ uint64_t num_nonzero = 0;
471
+ for (auto entry : _sketch_array) {
472
+ if (entry != static_cast<W>(0.0))
473
+ ++num_nonzero;
474
+ }
475
+
476
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
477
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
478
+ std::ostringstream os;
479
+ os << "### Count Min sketch summary:" << std::endl;
480
+ os << " num hashes : " << static_cast<uint32_t>(_num_hashes) << std::endl;
481
+ os << " num buckets : " << _num_buckets << std::endl;
482
+ os << " capacity bins : " << _sketch_array.size() << std::endl;
483
+ os << " filled bins : " << num_nonzero << std::endl;
484
+ os << " pct filled : " << std::setprecision(3) << (num_nonzero * 100.0) / _sketch_array.size() << "%" << std::endl;
485
+ os << "### End sketch summary" << std::endl;
486
+
487
+ return string<A>(os.str().c_str(), _allocator);
488
+ }
489
+
490
+ template<typename W, typename A>
491
+ void count_min_sketch<W,A>::check_header_validity(uint8_t preamble_longs, uint8_t serial_version, uint8_t family_id, uint8_t flags_byte) {
492
+ const bool empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
493
+
494
+ const uint8_t sw = (empty ? 1 : 0) + (2 * serial_version) + (4 * family_id) + (32 * (preamble_longs & 0x3F));
495
+ bool valid = true;
496
+
497
+ switch (sw) { // exhaustive list and description of all valid cases
498
+ case 138 : break; // !empty, ser_ver==1, family==18, preLongs=2;
499
+ case 139 : break; // empty, ser_ver==1, family==18, preLongs=2;
500
+ //case 170 : break ; // !empty, ser_ver==1, family==18, preLongs=3 ;
501
+ default : // all other case values are invalid
502
+ valid = false;
503
+ }
504
+
505
+ if (!valid) {
506
+ std::ostringstream os;
507
+ os << "Possible sketch corruption. Inconsistent state: "
508
+ << "preamble_longs = " << static_cast<uint32_t>(preamble_longs)
509
+ << ", empty = " << (empty ? "true" : "false")
510
+ << ", serialization_version = " << static_cast<uint32_t>(serial_version) ;
511
+ throw std::invalid_argument(os.str());
512
+ }
513
+ }
514
+
515
+ } /* namespace datasketches */
516
+
517
+ #endif
@@ -0,0 +1,43 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
# Test executable for the count-min sketch; its sources are attached right below.
add_executable(count_min_test)

target_sources(count_min_test
  PRIVATE
    count_min_test.cpp
    count_min_allocation_test.cpp
)

target_link_libraries(count_min_test count common_test_lib)

set_target_properties(count_min_test PROPERTIES
  CXX_STANDARD 11
  CXX_STANDARD_REQUIRED YES
)

# TEST_BINARY_INPUT_PATH is this source directory in CMake path form, with a trailing slash.
file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" COUNT_TEST_BINARY_PATH)
string(APPEND COUNT_TEST_BINARY_PATH "/")
target_compile_definitions(count_min_test
  PRIVATE
    TEST_BINARY_INPUT_PATH="${COUNT_TEST_BINARY_PATH}"
)

# Register the executable with CTest.
add_test(
  NAME count_min_test
  COMMAND count_min_test
)