datasketches 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
- metadata +8 -3
@@ -191,8 +191,8 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_accumulator() const {
|
|
191
191
|
|
192
192
|
template<typename A>
|
193
193
|
cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
|
194
|
-
const
|
195
|
-
const
|
194
|
+
const uint32_t k = 1 << lg_k;
|
195
|
+
const uint32_t num_coupons = count_bits_set_in_matrix(bit_matrix.data(), k);
|
196
196
|
|
197
197
|
const auto flavor = cpc_sketch_alloc<A>::determine_flavor(lg_k, num_coupons);
|
198
198
|
if (flavor != cpc_sketch_alloc<A>::flavor::HYBRID && flavor != cpc_sketch_alloc<A>::flavor::PINNED
|
@@ -215,7 +215,7 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
|
|
215
215
|
|
216
216
|
// The snowplow effect was caused by processing the rows in order,
|
217
217
|
// but we have fixed it by using a sufficiently large hash table.
|
218
|
-
for (
|
218
|
+
for (uint32_t i = 0; i < k; i++) {
|
219
219
|
uint64_t pattern = bit_matrix[i];
|
220
220
|
sliding_window[i] = (pattern >> offset) & 0xff;
|
221
221
|
pattern &= mask_for_clearing_window;
|
@@ -250,17 +250,17 @@ void cpc_union_alloc<A>::switch_to_bit_matrix() {
|
|
250
250
|
template<typename A>
|
251
251
|
void cpc_union_alloc<A>::walk_table_updating_sketch(const u32_table<A>& table) {
|
252
252
|
const uint32_t* slots = table.get_slots();
|
253
|
-
const
|
253
|
+
const uint32_t num_slots = 1 << table.get_lg_size();
|
254
254
|
const uint64_t dst_mask = (((1 << accumulator->get_lg_k()) - 1) << 6) | 63; // downsamples when dst lgK < src LgK
|
255
255
|
|
256
256
|
// Using a golden ratio stride fixes the snowplow effect.
|
257
257
|
const double golden = 0.6180339887498949025;
|
258
|
-
|
258
|
+
uint32_t stride = static_cast<uint32_t>(golden * static_cast<double>(num_slots));
|
259
259
|
if (stride < 2) throw std::logic_error("stride < 2");
|
260
260
|
if (stride == ((stride >> 1) << 1)) stride += 1; // force the stride to be odd
|
261
261
|
if (stride < 3 || stride >= num_slots) throw std::out_of_range("stride out of range");
|
262
262
|
|
263
|
-
for (
|
263
|
+
for (uint32_t i = 0, j = 0; i < num_slots; i++, j += stride) {
|
264
264
|
j &= num_slots - 1;
|
265
265
|
const uint32_t row_col = slots[j];
|
266
266
|
if (row_col != UINT32_MAX) {
|
@@ -272,13 +272,13 @@ void cpc_union_alloc<A>::walk_table_updating_sketch(const u32_table<A>& table) {
|
|
272
272
|
template<typename A>
|
273
273
|
void cpc_union_alloc<A>::or_table_into_matrix(const u32_table<A>& table) {
|
274
274
|
const uint32_t* slots = table.get_slots();
|
275
|
-
const
|
275
|
+
const uint32_t num_slots = 1 << table.get_lg_size();
|
276
276
|
const uint64_t dest_mask = (1 << lg_k) - 1; // downsamples when dst lgK < sr LgK
|
277
|
-
for (
|
277
|
+
for (uint32_t i = 0; i < num_slots; i++) {
|
278
278
|
const uint32_t row_col = slots[i];
|
279
279
|
if (row_col != UINT32_MAX) {
|
280
280
|
const uint8_t col = row_col & 63;
|
281
|
-
const
|
281
|
+
const uint32_t row = row_col >> 6;
|
282
282
|
bit_matrix[row & dest_mask] |= static_cast<uint64_t>(1) << col; // set the bit
|
283
283
|
}
|
284
284
|
}
|
@@ -288,8 +288,8 @@ template<typename A>
|
|
288
288
|
void cpc_union_alloc<A>::or_window_into_matrix(const vector_u8<A>& sliding_window, uint8_t offset, uint8_t src_lg_k) {
|
289
289
|
if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK");
|
290
290
|
const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK
|
291
|
-
const
|
292
|
-
for (
|
291
|
+
const uint32_t src_k = 1 << src_lg_k;
|
292
|
+
for (uint32_t src_row = 0; src_row < src_k; src_row++) {
|
293
293
|
bit_matrix[src_row & dst_mask] |= static_cast<uint64_t>(sliding_window[src_row]) << offset;
|
294
294
|
}
|
295
295
|
}
|
@@ -298,8 +298,8 @@ template<typename A>
|
|
298
298
|
void cpc_union_alloc<A>::or_matrix_into_matrix(const vector_u64<A>& src_matrix, uint8_t src_lg_k) {
|
299
299
|
if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK");
|
300
300
|
const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK
|
301
|
-
const
|
302
|
-
for (
|
301
|
+
const uint32_t src_k = 1 << src_lg_k;
|
302
|
+
for (uint32_t src_row = 0; src_row < src_k; src_row++) {
|
303
303
|
bit_matrix[src_row & dst_mask] |= src_matrix[src_row];
|
304
304
|
}
|
305
305
|
}
|
@@ -313,7 +313,7 @@ void cpc_union_alloc<A>::reduce_k(uint8_t new_lg_k) {
|
|
313
313
|
if (accumulator != nullptr) throw std::logic_error("accumulator is not null");
|
314
314
|
vector_u64<A> old_matrix = std::move(bit_matrix);
|
315
315
|
const uint8_t old_lg_k = lg_k;
|
316
|
-
const
|
316
|
+
const uint32_t new_k = 1 << new_lg_k;
|
317
317
|
bit_matrix = vector_u64<A>(new_k, 0, old_matrix.get_allocator());
|
318
318
|
lg_k = new_lg_k;
|
319
319
|
or_matrix_into_matrix(old_matrix, old_lg_k);
|
@@ -31,9 +31,9 @@ static inline uint64_t divide_longs_rounding_up(uint64_t x, uint64_t y) {
|
|
31
31
|
else return quotient + 1;
|
32
32
|
}
|
33
33
|
|
34
|
-
static inline
|
35
|
-
if (x < 1) throw std::invalid_argument("
|
36
|
-
|
34
|
+
static inline uint8_t floor_log2_of_long(uint64_t x) {
|
35
|
+
if (x < 1) throw std::invalid_argument("floor_log2_of_long: bad argument");
|
36
|
+
uint8_t p = 0;
|
37
37
|
uint64_t y = 1;
|
38
38
|
while (true) {
|
39
39
|
if (y == x) return p;
|
@@ -69,7 +69,7 @@ static inline uint64_t wegner_count_bits_set_in_matrix(const uint64_t* array, si
|
|
69
69
|
// Note: this is an adaptation of the Java code,
|
70
70
|
// which is apparently a variation of Figure 5-2 in "Hacker's Delight"
|
71
71
|
// by Henry S. Warren.
|
72
|
-
static inline uint64_t warren_bit_count(uint64_t i) {
|
72
|
+
static inline uint32_t warren_bit_count(uint64_t i) {
|
73
73
|
i = i - ((i >> 1) & 0x5555555555555555ULL);
|
74
74
|
i = (i & 0x3333333333333333ULL) + ((i >> 2) & 0x3333333333333333ULL);
|
75
75
|
i = (i + (i >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
|
@@ -79,9 +79,9 @@ static inline uint64_t warren_bit_count(uint64_t i) {
|
|
79
79
|
return i & 0x7f;
|
80
80
|
}
|
81
81
|
|
82
|
-
static inline uint64_t warren_count_bits_set_in_matrix(const uint64_t* array, size_t length) {
|
83
|
-
|
84
|
-
for (
|
82
|
+
static inline uint32_t warren_count_bits_set_in_matrix(const uint64_t* array, uint32_t length) {
|
83
|
+
uint32_t count = 0;
|
84
|
+
for (uint32_t i = 0; i < length; i++) {
|
85
85
|
count += warren_bit_count(array[i]);
|
86
86
|
}
|
87
87
|
return count;
|
@@ -91,13 +91,13 @@ static inline uint64_t warren_count_bits_set_in_matrix(const uint64_t* array, si
|
|
91
91
|
|
92
92
|
#define CSA(h,l,a,b,c) {uint64_t u = a ^ b; uint64_t v = c; h = (a & b) | (u & v); l = u ^ v;}
|
93
93
|
|
94
|
-
static inline
|
94
|
+
static inline uint32_t count_bits_set_in_matrix(const uint64_t* a, uint32_t length) {
|
95
95
|
if ((length & 0x7) != 0) throw std::invalid_argument("the length of the array must be a multiple of 8");
|
96
|
-
|
96
|
+
uint32_t total = 0;
|
97
97
|
uint64_t ones, twos, twos_a, twos_b, fours, fours_a, fours_b, eights;
|
98
98
|
fours = twos = ones = 0;
|
99
99
|
|
100
|
-
for (
|
100
|
+
for (uint32_t i = 0; i <= length - 8; i += 8) {
|
101
101
|
CSA(twos_a, ones, ones, a[i+0], a[i+1]);
|
102
102
|
CSA(twos_b, ones, ones, a[i+2], a[i+3]);
|
103
103
|
CSA(fours_a, twos, twos, twos_a, twos_b);
|
@@ -245,12 +245,12 @@ static inline double icon_exponential_approximation(double k, double c) {
|
|
245
245
|
return (0.7940236163830469 * k * pow(2.0, c / k));
|
246
246
|
}
|
247
247
|
|
248
|
-
static inline double compute_icon_estimate(uint8_t lg_k,
|
248
|
+
static inline double compute_icon_estimate(uint8_t lg_k, uint32_t c) {
|
249
249
|
if (lg_k < ICON_MIN_LOG_K || lg_k > ICON_MAX_LOG_K) throw std::out_of_range("lg_k out of range");
|
250
250
|
if (c < 2) return ((c == 0) ? 0.0 : 1.0);
|
251
|
-
const
|
252
|
-
const double double_k = k;
|
253
|
-
const double double_c = c;
|
251
|
+
const uint32_t k = 1 << lg_k;
|
252
|
+
const double double_k = static_cast<double>(k);
|
253
|
+
const double double_c = static_cast<double>(c);
|
254
254
|
// Differing thresholds ensure that the approximated estimator is monotonically increasing.
|
255
255
|
const double threshold_factor = ((lg_k < 14) ? 5.7 : 5.6);
|
256
256
|
if (double_c > (threshold_factor * double_k)) return icon_exponential_approximation(double_k, double_c);
|
@@ -29,11 +29,11 @@
|
|
29
29
|
|
30
30
|
namespace datasketches {
|
31
31
|
|
32
|
-
static const
|
33
|
-
static const
|
32
|
+
static const uint32_t U32_TABLE_UPSIZE_NUMER = 3LL;
|
33
|
+
static const uint32_t U32_TABLE_UPSIZE_DENOM = 4LL;
|
34
34
|
|
35
|
-
static const
|
36
|
-
static const
|
35
|
+
static const uint32_t U32_TABLE_DOWNSIZE_NUMER = 1LL;
|
36
|
+
static const uint32_t U32_TABLE_DOWNSIZE_DENOM = 4LL;
|
37
37
|
|
38
38
|
template<typename A>
|
39
39
|
class u32_table {
|
@@ -42,7 +42,7 @@ public:
|
|
42
42
|
u32_table(const A& allocator);
|
43
43
|
u32_table(uint8_t lg_size, uint8_t num_valid_bits, const A& allocator);
|
44
44
|
|
45
|
-
inline
|
45
|
+
inline uint32_t get_num_items() const;
|
46
46
|
inline const uint32_t* get_slots() const;
|
47
47
|
inline uint8_t get_lg_size() const;
|
48
48
|
inline void clear();
|
@@ -52,7 +52,7 @@ public:
|
|
52
52
|
// returns true iff the item was present and was therefore removed from the table
|
53
53
|
inline bool maybe_delete(uint32_t item);
|
54
54
|
|
55
|
-
static u32_table make_from_pairs(const uint32_t* pairs,
|
55
|
+
static u32_table make_from_pairs(const uint32_t* pairs, uint32_t num_pairs, uint8_t lg_k, const A& allocator);
|
56
56
|
|
57
57
|
vector_u32<A> unwrapping_get_items() const;
|
58
58
|
|
@@ -69,10 +69,10 @@ private:
|
|
69
69
|
|
70
70
|
uint8_t lg_size; // log2 of number of slots
|
71
71
|
uint8_t num_valid_bits;
|
72
|
-
|
72
|
+
uint32_t num_items;
|
73
73
|
vector_u32<A> slots;
|
74
74
|
|
75
|
-
inline
|
75
|
+
inline uint32_t lookup(uint32_t item) const;
|
76
76
|
inline void must_insert(uint32_t item);
|
77
77
|
inline void rebuild(uint8_t new_lg_size);
|
78
78
|
};
|
@@ -41,14 +41,14 @@ u32_table<A>::u32_table(uint8_t lg_size, uint8_t num_valid_bits, const A& alloca
|
|
41
41
|
lg_size(lg_size),
|
42
42
|
num_valid_bits(num_valid_bits),
|
43
43
|
num_items(0),
|
44
|
-
slots(
|
44
|
+
slots(1ULL << lg_size, UINT32_MAX, allocator)
|
45
45
|
{
|
46
46
|
if (lg_size < 2) throw std::invalid_argument("lg_size must be >= 2");
|
47
47
|
if (num_valid_bits < 1 || num_valid_bits > 32) throw std::invalid_argument("num_valid_bits must be between 1 and 32");
|
48
48
|
}
|
49
49
|
|
50
50
|
template<typename A>
|
51
|
-
|
51
|
+
uint32_t u32_table<A>::get_num_items() const {
|
52
52
|
return num_items;
|
53
53
|
}
|
54
54
|
|
@@ -70,7 +70,7 @@ void u32_table<A>::clear() {
|
|
70
70
|
|
71
71
|
template<typename A>
|
72
72
|
bool u32_table<A>::maybe_insert(uint32_t item) {
|
73
|
-
const
|
73
|
+
const uint32_t index = lookup(item);
|
74
74
|
if (slots[index] == item) return false;
|
75
75
|
if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert");
|
76
76
|
slots[index] = item;
|
@@ -83,7 +83,7 @@ bool u32_table<A>::maybe_insert(uint32_t item) {
|
|
83
83
|
|
84
84
|
template<typename A>
|
85
85
|
bool u32_table<A>::maybe_delete(uint32_t item) {
|
86
|
-
const
|
86
|
+
const uint32_t index = lookup(item);
|
87
87
|
if (slots[index] == UINT32_MAX) return false;
|
88
88
|
if (slots[index] != item) throw std::logic_error("item does not exist");
|
89
89
|
if (num_items == 0) throw std::logic_error("delete error");
|
@@ -110,7 +110,7 @@ bool u32_table<A>::maybe_delete(uint32_t item) {
|
|
110
110
|
|
111
111
|
// this one is specifically tailored to be a part of fm85 decompression scheme
|
112
112
|
template<typename A>
|
113
|
-
u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs, size_t num_pairs, uint8_t lg_k, const A& allocator) {
|
113
|
+
u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs, uint32_t num_pairs, uint8_t lg_k, const A& allocator) {
|
114
114
|
uint8_t lg_num_slots = 2;
|
115
115
|
while (U32_TABLE_UPSIZE_DENOM * num_pairs > U32_TABLE_UPSIZE_NUMER * (1 << lg_num_slots)) lg_num_slots++;
|
116
116
|
u32_table<A> table(lg_num_slots, 6 + lg_k, allocator);
|
@@ -124,11 +124,11 @@ u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs, size_t num_pai
|
|
124
124
|
}
|
125
125
|
|
126
126
|
template<typename A>
|
127
|
-
size_t u32_table<A>::lookup(uint32_t item) const {
128
|
-
const
|
129
|
-
const
|
127
|
+
uint32_t u32_table<A>::lookup(uint32_t item) const {
|
128
|
+
const uint32_t size = 1 << lg_size;
|
129
|
+
const uint32_t mask = size - 1;
|
130
130
|
const uint8_t shift = num_valid_bits - lg_size;
|
131
|
-
|
131
|
+
uint32_t probe = item >> shift;
|
132
132
|
if (probe > mask) throw std::logic_error("probe out of range");
|
133
133
|
while (slots[probe] != item && slots[probe] != UINT32_MAX) {
|
134
134
|
probe = (probe + 1) & mask;
|
@@ -139,7 +139,7 @@ size_t u32_table<A>::lookup(uint32_t item) const {
|
|
139
139
|
// counts and resizing must be handled by the caller
|
140
140
|
template<typename A>
|
141
141
|
void u32_table<A>::must_insert(uint32_t item) {
|
142
|
-
const
|
142
|
+
const uint32_t index = lookup(item);
|
143
143
|
if (slots[index] == item) throw std::logic_error("item exists");
|
144
144
|
if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert");
|
145
145
|
slots[index] = item;
|
@@ -148,13 +148,13 @@ void u32_table<A>::must_insert(uint32_t item) {
|
|
148
148
|
template<typename A>
|
149
149
|
void u32_table<A>::rebuild(uint8_t new_lg_size) {
|
150
150
|
if (new_lg_size < 2) throw std::logic_error("lg_size must be >= 2");
|
151
|
-
const
|
152
|
-
const
|
151
|
+
const uint32_t old_size = 1 << lg_size;
|
152
|
+
const uint32_t new_size = 1 << new_lg_size;
|
153
153
|
if (new_size <= num_items) throw std::logic_error("new_size <= num_items");
|
154
154
|
vector_u32<A> old_slots = std::move(slots);
|
155
155
|
slots = vector_u32<A>(new_size, UINT32_MAX, old_slots.get_allocator());
|
156
156
|
lg_size = new_lg_size;
|
157
|
-
for (
|
157
|
+
for (uint32_t i = 0; i < old_size; i++) {
|
158
158
|
if (old_slots[i] != UINT32_MAX) {
|
159
159
|
must_insert(old_slots[i]);
|
160
160
|
}
|
@@ -170,7 +170,7 @@ void u32_table<A>::rebuild(uint8_t new_lg_size) {
|
|
170
170
|
template<typename A>
|
171
171
|
vector_u32<A> u32_table<A>::unwrapping_get_items() const {
|
172
172
|
if (num_items == 0) return vector_u32<A>(slots.get_allocator());
|
173
|
-
const
|
173
|
+
const uint32_t table_size = 1 << lg_size;
|
174
174
|
vector_u32<A> result(num_items, 0, slots.get_allocator());
|
175
175
|
size_t i = 0;
|
176
176
|
size_t l = 0;
|
@@ -27,38 +27,38 @@ namespace datasketches {
|
|
27
27
|
typedef u32_table<std::allocator<void>> table;
|
28
28
|
|
29
29
|
TEST_CASE("cpc sketch: compress and decompress pairs", "[cpc_sketch]") {
|
30
|
-
const
|
31
|
-
const
|
30
|
+
const size_t N = 200;
|
31
|
+
const size_t MAXWORDS = 1000;
|
32
32
|
|
33
33
|
HashState twoHashes;
|
34
34
|
uint32_t pairArray[N];
|
35
35
|
uint32_t pairArray2[N];
|
36
36
|
uint64_t value = 35538947; // some arbitrary starting value
|
37
37
|
const uint64_t golden64 = 0x9e3779b97f4a7c13ULL; // the golden ratio
|
38
|
-
for (
|
38
|
+
for (size_t i = 0; i < N; i++) {
|
39
39
|
MurmurHash3_x64_128(&value, sizeof(value), 0, twoHashes);
|
40
40
|
uint32_t rand = twoHashes.h1 & 0xffff;
|
41
41
|
pairArray[i] = rand;
|
42
42
|
value += golden64;
|
43
43
|
}
|
44
44
|
//table::knuth_shell_sort3(pairArray, 0, N - 1); // unsigned numerical sort
|
45
|
-
std::sort(pairArray,
|
45
|
+
std::sort(pairArray, pairArray + N);
|
46
46
|
uint32_t prev = UINT32_MAX;
|
47
|
-
|
48
|
-
for (
|
47
|
+
uint32_t nxt = 0;
|
48
|
+
for (size_t i = 0; i < N; i++) { // uniquify
|
49
49
|
if (pairArray[i] != prev) {
|
50
50
|
prev = pairArray[i];
|
51
51
|
pairArray[nxt++] = pairArray[i];
|
52
52
|
}
|
53
53
|
}
|
54
|
-
|
54
|
+
uint32_t numPairs = nxt;
|
55
55
|
|
56
56
|
uint32_t compressedWords[MAXWORDS];
|
57
57
|
|
58
|
-
for (
|
59
|
-
|
58
|
+
for (uint8_t numBaseBits = 0; numBaseBits <= 11; numBaseBits++) {
|
59
|
+
uint32_t numWordsWritten = get_compressor<std::allocator<void>>().low_level_compress_pairs(pairArray, numPairs, numBaseBits, compressedWords);
|
60
60
|
get_compressor<std::allocator<void>>().low_level_uncompress_pairs(pairArray2, numPairs, numBaseBits, compressedWords, numWordsWritten);
|
61
|
-
for (
|
61
|
+
for (size_t i = 0; i < numPairs; i++) {
|
62
62
|
REQUIRE(pairArray[i] == pairArray2[i]);
|
63
63
|
}
|
64
64
|
}
|
@@ -283,6 +283,26 @@ TEST_CASE("cpc sketch: serialize deserialize sliding, bytes", "[cpc_sketch]") {
|
|
283
283
|
REQUIRE(deserialized.validate());
|
284
284
|
}
|
285
285
|
|
286
|
+
TEST_CASE("cpc sketch: serialize deserialize sliding huge", "[cpc_sketch]") {
|
287
|
+
cpc_sketch sketch(26);
|
288
|
+
const int n = 10000000;
|
289
|
+
for (int i = 0; i < n; i++) sketch.update(i);
|
290
|
+
REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.001));
|
291
|
+
auto bytes = sketch.serialize();
|
292
|
+
cpc_sketch deserialized = cpc_sketch::deserialize(bytes.data(), bytes.size());
|
293
|
+
REQUIRE(deserialized.is_empty() == sketch.is_empty());
|
294
|
+
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
|
295
|
+
REQUIRE(deserialized.validate());
|
296
|
+
REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), 7), std::out_of_range);
|
297
|
+
REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), 15), std::out_of_range);
|
298
|
+
REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
|
299
|
+
|
300
|
+
// updating again with the same values should not change the sketch
|
301
|
+
for (int i = 0; i < n; i++) deserialized.update(i);
|
302
|
+
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
|
303
|
+
REQUIRE(deserialized.validate());
|
304
|
+
}
|
305
|
+
|
286
306
|
TEST_CASE("cpc sketch: copy", "[cpc_sketch]") {
|
287
307
|
cpc_sketch s1(11);
|
288
308
|
s1.update(1);
|
@@ -378,4 +398,9 @@ TEST_CASE("cpc sketch: update string equivalence", "[cpc_sketch]") {
|
|
378
398
|
REQUIRE(sketch.get_estimate() == Approx(1).margin(RELATIVE_ERROR_FOR_LG_K_11));
|
379
399
|
}
|
380
400
|
|
401
|
+
TEST_CASE("cpc sketch: max serialized size", "[cpc_sketch]") {
|
402
|
+
REQUIRE(cpc_sketch::get_max_serialized_size_bytes(4) == 24 + 40);
|
403
|
+
REQUIRE(cpc_sketch::get_max_serialized_size_bytes(26) == static_cast<size_t>((0.6 * (1 << 26)) + 40));
|
404
|
+
}
|
405
|
+
|
381
406
|
} /* namespace datasketches */
|
@@ -65,7 +65,7 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
65
65
|
void frequent_items_sketch<T, W, H, E, S, A>::merge(const frequent_items_sketch& other) {
|
66
66
|
if (other.is_empty()) return;
|
67
67
|
const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
|
68
|
-
for (auto
|
68
|
+
for (auto it: other.map) {
|
69
69
|
update(it.first, it.second);
|
70
70
|
}
|
71
71
|
offset += other.offset;
|
@@ -76,7 +76,7 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
76
76
|
void frequent_items_sketch<T, W, H, E, S, A>::merge(frequent_items_sketch&& other) {
|
77
77
|
if (other.is_empty()) return;
|
78
78
|
const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
|
79
|
-
for (auto
|
79
|
+
for (auto it: other.map) {
|
80
80
|
update(std::move(it.first), it.second);
|
81
81
|
}
|
82
82
|
offset += other.offset;
|
@@ -147,7 +147,7 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
147
147
|
typename frequent_items_sketch<T, W, H, E, S, A>::vector_row
|
148
148
|
frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error_type err_type, W threshold) const {
|
149
149
|
vector_row items(map.get_allocator());
|
150
|
-
for (auto
|
150
|
+
for (auto it: map) {
|
151
151
|
const W lb = it.second;
|
152
152
|
const W ub = it.second + offset;
|
153
153
|
if ((err_type == NO_FALSE_NEGATIVES && ub > threshold) || (err_type == NO_FALSE_POSITIVES && lb > threshold)) {
|
@@ -162,28 +162,28 @@ frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error
|
|
162
162
|
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
163
163
|
void frequent_items_sketch<T, W, H, E, S, A>::serialize(std::ostream& os) const {
|
164
164
|
const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_NONEMPTY;
|
165
|
-
|
165
|
+
write(os, preamble_longs);
|
166
166
|
const uint8_t serial_version = SERIAL_VERSION;
|
167
|
-
|
167
|
+
write(os, serial_version);
|
168
168
|
const uint8_t family = FAMILY_ID;
|
169
|
-
|
169
|
+
write(os, family);
|
170
170
|
const uint8_t lg_max_size = map.get_lg_max_size();
|
171
|
-
|
171
|
+
write(os, lg_max_size);
|
172
172
|
const uint8_t lg_cur_size = map.get_lg_cur_size();
|
173
|
-
|
173
|
+
write(os, lg_cur_size);
|
174
174
|
const uint8_t flags_byte(
|
175
175
|
(is_empty() ? 1 << flags::IS_EMPTY : 0)
|
176
176
|
);
|
177
|
-
|
177
|
+
write(os, flags_byte);
|
178
178
|
const uint16_t unused16 = 0;
|
179
|
-
|
179
|
+
write(os, unused16);
|
180
180
|
if (!is_empty()) {
|
181
181
|
const uint32_t num_items = map.get_num_active();
|
182
|
-
|
182
|
+
write(os, num_items);
|
183
183
|
const uint32_t unused32 = 0;
|
184
|
-
|
185
|
-
|
186
|
-
|
184
|
+
write(os, unused32);
|
185
|
+
write(os, total_weight);
|
186
|
+
write(os, offset);
|
187
187
|
|
188
188
|
// copy active items and their weights to use batch serialization
|
189
189
|
using AllocW = typename std::allocator_traits<A>::template rebind_alloc<W>;
|
@@ -192,14 +192,14 @@ void frequent_items_sketch<T, W, H, E, S, A>::serialize(std::ostream& os) const
|
|
192
192
|
A alloc(map.get_allocator());
|
193
193
|
T* items = alloc.allocate(num_items);
|
194
194
|
uint32_t i = 0;
|
195
|
-
for (auto
|
195
|
+
for (auto it: map) {
|
196
196
|
new (&items[i]) T(it.first);
|
197
197
|
weights[i++] = it.second;
|
198
198
|
}
|
199
|
-
|
199
|
+
write(os, weights, sizeof(W) * num_items);
|
200
200
|
aw.deallocate(weights, num_items);
|
201
201
|
S().serialize(os, items, num_items);
|
202
|
-
for (
|
202
|
+
for (i = 0; i < num_items; i++) items[i].~T();
|
203
203
|
alloc.deallocate(items, num_items);
|
204
204
|
}
|
205
205
|
}
|
@@ -208,7 +208,7 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
208
208
|
size_t frequent_items_sketch<T, W, H, E, S, A>::get_serialized_size_bytes() const {
|
209
209
|
if (is_empty()) return PREAMBLE_LONGS_EMPTY * sizeof(uint64_t);
|
210
210
|
size_t size = PREAMBLE_LONGS_NONEMPTY * sizeof(uint64_t) + map.get_num_active() * sizeof(W);
|
211
|
-
for (auto
|
211
|
+
for (auto it: map) size += S().size_of_item(it.first);
|
212
212
|
return size;
|
213
213
|
}
|
214
214
|
|
@@ -220,28 +220,26 @@ auto frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_byt
|
|
220
220
|
uint8_t* end_ptr = ptr + size;
|
221
221
|
|
222
222
|
const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_NONEMPTY;
|
223
|
-
ptr += copy_to_mem(
|
223
|
+
ptr += copy_to_mem(preamble_longs, ptr);
|
224
224
|
const uint8_t serial_version = SERIAL_VERSION;
|
225
|
-
ptr += copy_to_mem(
|
225
|
+
ptr += copy_to_mem(serial_version, ptr);
|
226
226
|
const uint8_t family = FAMILY_ID;
|
227
|
-
ptr += copy_to_mem(
|
227
|
+
ptr += copy_to_mem(family, ptr);
|
228
228
|
const uint8_t lg_max_size = map.get_lg_max_size();
|
229
|
-
ptr += copy_to_mem(
|
229
|
+
ptr += copy_to_mem(lg_max_size, ptr);
|
230
230
|
const uint8_t lg_cur_size = map.get_lg_cur_size();
|
231
|
-
ptr += copy_to_mem(
|
231
|
+
ptr += copy_to_mem(lg_cur_size, ptr);
|
232
232
|
const uint8_t flags_byte(
|
233
233
|
(is_empty() ? 1 << flags::IS_EMPTY : 0)
|
234
234
|
);
|
235
|
-
ptr += copy_to_mem(
|
236
|
-
|
237
|
-
ptr += copy_to_mem(&unused16, ptr, sizeof(uint16_t));
|
235
|
+
ptr += copy_to_mem(flags_byte, ptr);
|
236
|
+
ptr += sizeof(uint16_t); // unused
|
238
237
|
if (!is_empty()) {
|
239
238
|
const uint32_t num_items = map.get_num_active();
|
240
|
-
ptr += copy_to_mem(
|
241
|
-
|
242
|
-
ptr += copy_to_mem(
|
243
|
-
ptr += copy_to_mem(
|
244
|
-
ptr += copy_to_mem(&offset, ptr, sizeof(offset));
|
239
|
+
ptr += copy_to_mem(num_items, ptr);
|
240
|
+
ptr += sizeof(uint32_t); // unused
|
241
|
+
ptr += copy_to_mem(total_weight, ptr);
|
242
|
+
ptr += copy_to_mem(offset, ptr);
|
245
243
|
|
246
244
|
// copy active items and their weights to use batch serialization
|
247
245
|
using AllocW = typename std::allocator_traits<A>::template rebind_alloc<W>;
|
@@ -250,7 +248,7 @@ auto frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_byt
|
|
250
248
|
A alloc(map.get_allocator());
|
251
249
|
T* items = alloc.allocate(num_items);
|
252
250
|
uint32_t i = 0;
|
253
|
-
for (auto
|
251
|
+
for (auto it: map) {
|
254
252
|
new (&items[i]) T(it.first);
|
255
253
|
weights[i++] = it.second;
|
256
254
|
}
|
@@ -258,7 +256,7 @@ auto frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_byt
|
|
258
256
|
aw.deallocate(weights, num_items);
|
259
257
|
const size_t bytes_remaining = end_ptr - ptr;
|
260
258
|
ptr += S().serialize(ptr, bytes_remaining, items, num_items);
|
261
|
-
for (
|
259
|
+
for (i = 0; i < num_items; i++) items[i].~T();
|
262
260
|
alloc.deallocate(items, num_items);
|
263
261
|
}
|
264
262
|
return bytes;
|
@@ -268,38 +266,31 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
268
266
|
class frequent_items_sketch<T, W, H, E, S, A>::items_deleter {
|
269
267
|
public:
|
270
268
|
items_deleter(uint32_t num, bool destroy, const A& allocator):
|
271
|
-
|
272
|
-
void set_destroy(bool destroy) {
|
269
|
+
allocator_(allocator), num_(num), destroy_(destroy) {}
|
270
|
+
void set_destroy(bool destroy) { destroy_ = destroy; }
|
273
271
|
void operator() (T* ptr) {
|
274
272
|
if (ptr != nullptr) {
|
275
|
-
if (
|
276
|
-
for (uint32_t i = 0; i <
|
273
|
+
if (destroy_) {
|
274
|
+
for (uint32_t i = 0; i < num_; ++i) ptr[i].~T();
|
277
275
|
}
|
278
|
-
|
276
|
+
allocator_.deallocate(ptr, num_);
|
279
277
|
}
|
280
278
|
}
|
281
279
|
private:
|
282
|
-
A
|
283
|
-
uint32_t
|
284
|
-
bool
|
280
|
+
A allocator_;
|
281
|
+
uint32_t num_;
|
282
|
+
bool destroy_;
|
285
283
|
};
|
286
284
|
|
287
285
|
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
288
286
|
frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(std::istream& is, const A& allocator) {
|
289
|
-
|
290
|
-
|
291
|
-
uint8_t
|
292
|
-
|
293
|
-
uint8_t
|
294
|
-
|
295
|
-
|
296
|
-
is.read((char*)&lg_max_size, sizeof(lg_max_size));
|
297
|
-
uint8_t lg_cur_size;
|
298
|
-
is.read((char*)&lg_cur_size, sizeof(lg_cur_size));
|
299
|
-
uint8_t flags_byte;
|
300
|
-
is.read((char*)&flags_byte, sizeof(flags_byte));
|
301
|
-
uint16_t unused16;
|
302
|
-
is.read((char*)&unused16, sizeof(unused16));
|
287
|
+
const auto preamble_longs = read<uint8_t>(is);
|
288
|
+
const auto serial_version = read<uint8_t>(is);
|
289
|
+
const auto family_id = read<uint8_t>(is);
|
290
|
+
const auto lg_max_size = read<uint8_t>(is);
|
291
|
+
const auto lg_cur_size = read<uint8_t>(is);
|
292
|
+
const auto flags_byte = read<uint8_t>(is);
|
293
|
+
read<uint16_t>(is); // unused
|
303
294
|
|
304
295
|
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
305
296
|
|
@@ -310,19 +301,15 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
|
|
310
301
|
|
311
302
|
frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size, allocator);
|
312
303
|
if (!is_empty) {
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
W total_weight;
|
318
|
-
is.read((char*)&total_weight, sizeof(total_weight));
|
319
|
-
W offset;
|
320
|
-
is.read((char*)&offset, sizeof(offset));
|
304
|
+
const auto num_items = read<uint32_t>(is);
|
305
|
+
read<uint32_t>(is); // unused
|
306
|
+
const auto total_weight = read<W>(is);
|
307
|
+
const auto offset = read<W>(is);
|
321
308
|
|
322
309
|
// batch deserialization with intermediate array of items and weights
|
323
310
|
using AllocW = typename std::allocator_traits<A>::template rebind_alloc<W>;
|
324
311
|
std::vector<W, AllocW> weights(num_items, 0, allocator);
|
325
|
-
|
312
|
+
read(is, weights.data(), sizeof(W) * num_items);
|
326
313
|
A alloc(allocator);
|
327
314
|
std::unique_ptr<T, items_deleter> items(alloc.allocate(num_items), items_deleter(num_items, false, alloc));
|
328
315
|
S().deserialize(is, items.get(), num_items);
|
@@ -344,19 +331,18 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
|
|
344
331
|
const char* ptr = static_cast<const char*>(bytes);
|
345
332
|
const char* base = static_cast<const char*>(bytes);
|
346
333
|
uint8_t preamble_longs;
|
347
|
-
ptr += copy_from_mem(ptr,
|
334
|
+
ptr += copy_from_mem(ptr, preamble_longs);
|
348
335
|
uint8_t serial_version;
|
349
|
-
ptr += copy_from_mem(ptr,
|
336
|
+
ptr += copy_from_mem(ptr, serial_version);
|
350
337
|
uint8_t family_id;
|
351
|
-
ptr += copy_from_mem(ptr,
|
338
|
+
ptr += copy_from_mem(ptr, family_id);
|
352
339
|
uint8_t lg_max_size;
|
353
|
-
ptr += copy_from_mem(ptr,
|
340
|
+
ptr += copy_from_mem(ptr, lg_max_size);
|
354
341
|
uint8_t lg_cur_size;
|
355
|
-
ptr += copy_from_mem(ptr,
|
342
|
+
ptr += copy_from_mem(ptr, lg_cur_size);
|
356
343
|
uint8_t flags_byte;
|
357
|
-
ptr += copy_from_mem(ptr,
|
358
|
-
uint16_t
|
359
|
-
ptr += copy_from_mem(ptr, &unused16, sizeof(uint16_t));
|
344
|
+
ptr += copy_from_mem(ptr, flags_byte);
|
345
|
+
ptr += sizeof(uint16_t); // unused
|
360
346
|
|
361
347
|
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
362
348
|
|
@@ -364,18 +350,17 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
|
|
364
350
|
check_serial_version(serial_version);
|
365
351
|
check_family_id(family_id);
|
366
352
|
check_size(lg_cur_size, lg_max_size);
|
367
|
-
ensure_minimum_memory(size,
|
353
|
+
ensure_minimum_memory(size, 1ULL << preamble_longs);
|
368
354
|
|
369
355
|
frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size, allocator);
|
370
356
|
if (!is_empty) {
|
371
357
|
uint32_t num_items;
|
372
|
-
ptr += copy_from_mem(ptr,
|
373
|
-
uint32_t
|
374
|
-
ptr += copy_from_mem(ptr, &unused32, sizeof(uint32_t));
|
358
|
+
ptr += copy_from_mem(ptr, num_items);
|
359
|
+
ptr += sizeof(uint32_t); // unused
|
375
360
|
W total_weight;
|
376
|
-
ptr += copy_from_mem(ptr,
|
361
|
+
ptr += copy_from_mem(ptr, total_weight);
|
377
362
|
W offset;
|
378
|
-
ptr += copy_from_mem(ptr,
|
363
|
+
ptr += copy_from_mem(ptr, offset);
|
379
364
|
|
380
365
|
ensure_minimum_memory(size, ptr - base + (sizeof(W) * num_items));
|
381
366
|
// batch deserialization with intermediate array of items and weights
|
@@ -446,14 +431,14 @@ string<A> frequent_items_sketch<T, W, H, E, S, A>::to_string(bool print_items) c
|
|
446
431
|
os << "### End sketch summary" << std::endl;
|
447
432
|
if (print_items) {
|
448
433
|
vector_row items;
|
449
|
-
for (auto
|
434
|
+
for (auto it: map) {
|
450
435
|
items.push_back(row(&it.first, it.second, offset));
|
451
436
|
}
|
452
437
|
// sort by estimate in descending order
|
453
438
|
std::sort(items.begin(), items.end(), [](row a, row b){ return a.get_estimate() > b.get_estimate(); });
|
454
439
|
os << "### Items in descending order by estimate" << std::endl;
|
455
440
|
os << " item, estimate, lower bound, upper bound" << std::endl;
|
456
|
-
for (auto
|
441
|
+
for (auto it: items) {
|
457
442
|
os << " " << it.get_item() << ", " << it.get_estimate() << ", "
|
458
443
|
<< it.get_lower_bound() << ", " << it.get_upper_bound() << std::endl;
|
459
444
|
}
|