datasketches 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
- metadata +8 -3
|
@@ -191,8 +191,8 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_accumulator() const {
|
|
|
191
191
|
|
|
192
192
|
template<typename A>
|
|
193
193
|
cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
|
|
194
|
-
const
|
|
195
|
-
const
|
|
194
|
+
const uint32_t k = 1 << lg_k;
|
|
195
|
+
const uint32_t num_coupons = count_bits_set_in_matrix(bit_matrix.data(), k);
|
|
196
196
|
|
|
197
197
|
const auto flavor = cpc_sketch_alloc<A>::determine_flavor(lg_k, num_coupons);
|
|
198
198
|
if (flavor != cpc_sketch_alloc<A>::flavor::HYBRID && flavor != cpc_sketch_alloc<A>::flavor::PINNED
|
|
@@ -215,7 +215,7 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
|
|
|
215
215
|
|
|
216
216
|
// The snowplow effect was caused by processing the rows in order,
|
|
217
217
|
// but we have fixed it by using a sufficiently large hash table.
|
|
218
|
-
for (
|
|
218
|
+
for (uint32_t i = 0; i < k; i++) {
|
|
219
219
|
uint64_t pattern = bit_matrix[i];
|
|
220
220
|
sliding_window[i] = (pattern >> offset) & 0xff;
|
|
221
221
|
pattern &= mask_for_clearing_window;
|
|
@@ -250,17 +250,17 @@ void cpc_union_alloc<A>::switch_to_bit_matrix() {
|
|
|
250
250
|
template<typename A>
|
|
251
251
|
void cpc_union_alloc<A>::walk_table_updating_sketch(const u32_table<A>& table) {
|
|
252
252
|
const uint32_t* slots = table.get_slots();
|
|
253
|
-
const
|
|
253
|
+
const uint32_t num_slots = 1 << table.get_lg_size();
|
|
254
254
|
const uint64_t dst_mask = (((1 << accumulator->get_lg_k()) - 1) << 6) | 63; // downsamples when dst lgK < src LgK
|
|
255
255
|
|
|
256
256
|
// Using a golden ratio stride fixes the snowplow effect.
|
|
257
257
|
const double golden = 0.6180339887498949025;
|
|
258
|
-
|
|
258
|
+
uint32_t stride = static_cast<uint32_t>(golden * static_cast<double>(num_slots));
|
|
259
259
|
if (stride < 2) throw std::logic_error("stride < 2");
|
|
260
260
|
if (stride == ((stride >> 1) << 1)) stride += 1; // force the stride to be odd
|
|
261
261
|
if (stride < 3 || stride >= num_slots) throw std::out_of_range("stride out of range");
|
|
262
262
|
|
|
263
|
-
for (
|
|
263
|
+
for (uint32_t i = 0, j = 0; i < num_slots; i++, j += stride) {
|
|
264
264
|
j &= num_slots - 1;
|
|
265
265
|
const uint32_t row_col = slots[j];
|
|
266
266
|
if (row_col != UINT32_MAX) {
|
|
@@ -272,13 +272,13 @@ void cpc_union_alloc<A>::walk_table_updating_sketch(const u32_table<A>& table) {
|
|
|
272
272
|
template<typename A>
|
|
273
273
|
void cpc_union_alloc<A>::or_table_into_matrix(const u32_table<A>& table) {
|
|
274
274
|
const uint32_t* slots = table.get_slots();
|
|
275
|
-
const
|
|
275
|
+
const uint32_t num_slots = 1 << table.get_lg_size();
|
|
276
276
|
const uint64_t dest_mask = (1 << lg_k) - 1; // downsamples when dst lgK < sr LgK
|
|
277
|
-
for (
|
|
277
|
+
for (uint32_t i = 0; i < num_slots; i++) {
|
|
278
278
|
const uint32_t row_col = slots[i];
|
|
279
279
|
if (row_col != UINT32_MAX) {
|
|
280
280
|
const uint8_t col = row_col & 63;
|
|
281
|
-
const
|
|
281
|
+
const uint32_t row = row_col >> 6;
|
|
282
282
|
bit_matrix[row & dest_mask] |= static_cast<uint64_t>(1) << col; // set the bit
|
|
283
283
|
}
|
|
284
284
|
}
|
|
@@ -288,8 +288,8 @@ template<typename A>
|
|
|
288
288
|
void cpc_union_alloc<A>::or_window_into_matrix(const vector_u8<A>& sliding_window, uint8_t offset, uint8_t src_lg_k) {
|
|
289
289
|
if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK");
|
|
290
290
|
const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK
|
|
291
|
-
const
|
|
292
|
-
for (
|
|
291
|
+
const uint32_t src_k = 1 << src_lg_k;
|
|
292
|
+
for (uint32_t src_row = 0; src_row < src_k; src_row++) {
|
|
293
293
|
bit_matrix[src_row & dst_mask] |= static_cast<uint64_t>(sliding_window[src_row]) << offset;
|
|
294
294
|
}
|
|
295
295
|
}
|
|
@@ -298,8 +298,8 @@ template<typename A>
|
|
|
298
298
|
void cpc_union_alloc<A>::or_matrix_into_matrix(const vector_u64<A>& src_matrix, uint8_t src_lg_k) {
|
|
299
299
|
if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK");
|
|
300
300
|
const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK
|
|
301
|
-
const
|
|
302
|
-
for (
|
|
301
|
+
const uint32_t src_k = 1 << src_lg_k;
|
|
302
|
+
for (uint32_t src_row = 0; src_row < src_k; src_row++) {
|
|
303
303
|
bit_matrix[src_row & dst_mask] |= src_matrix[src_row];
|
|
304
304
|
}
|
|
305
305
|
}
|
|
@@ -313,7 +313,7 @@ void cpc_union_alloc<A>::reduce_k(uint8_t new_lg_k) {
|
|
|
313
313
|
if (accumulator != nullptr) throw std::logic_error("accumulator is not null");
|
|
314
314
|
vector_u64<A> old_matrix = std::move(bit_matrix);
|
|
315
315
|
const uint8_t old_lg_k = lg_k;
|
|
316
|
-
const
|
|
316
|
+
const uint32_t new_k = 1 << new_lg_k;
|
|
317
317
|
bit_matrix = vector_u64<A>(new_k, 0, old_matrix.get_allocator());
|
|
318
318
|
lg_k = new_lg_k;
|
|
319
319
|
or_matrix_into_matrix(old_matrix, old_lg_k);
|
|
@@ -31,9 +31,9 @@ static inline uint64_t divide_longs_rounding_up(uint64_t x, uint64_t y) {
|
|
|
31
31
|
else return quotient + 1;
|
|
32
32
|
}
|
|
33
33
|
|
|
34
|
-
static inline
|
|
35
|
-
if (x < 1) throw std::invalid_argument("
|
|
36
|
-
|
|
34
|
+
static inline uint8_t floor_log2_of_long(uint64_t x) {
|
|
35
|
+
if (x < 1) throw std::invalid_argument("floor_log2_of_long: bad argument");
|
|
36
|
+
uint8_t p = 0;
|
|
37
37
|
uint64_t y = 1;
|
|
38
38
|
while (true) {
|
|
39
39
|
if (y == x) return p;
|
|
@@ -69,7 +69,7 @@ static inline uint64_t wegner_count_bits_set_in_matrix(const uint64_t* array, si
|
|
|
69
69
|
// Note: this is an adaptation of the Java code,
|
|
70
70
|
// which is apparently a variation of Figure 5-2 in "Hacker's Delight"
|
|
71
71
|
// by Henry S. Warren.
|
|
72
|
-
static inline
|
|
72
|
+
static inline uint32_t warren_bit_count(uint64_t i) {
|
|
73
73
|
i = i - ((i >> 1) & 0x5555555555555555ULL);
|
|
74
74
|
i = (i & 0x3333333333333333ULL) + ((i >> 2) & 0x3333333333333333ULL);
|
|
75
75
|
i = (i + (i >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
|
|
@@ -79,9 +79,9 @@ static inline uint64_t warren_bit_count(uint64_t i) {
|
|
|
79
79
|
return i & 0x7f;
|
|
80
80
|
}
|
|
81
81
|
|
|
82
|
-
static inline
|
|
83
|
-
|
|
84
|
-
for (
|
|
82
|
+
static inline uint32_t warren_count_bits_set_in_matrix(const uint64_t* array, uint32_t length) {
|
|
83
|
+
uint32_t count = 0;
|
|
84
|
+
for (uint32_t i = 0; i < length; i++) {
|
|
85
85
|
count += warren_bit_count(array[i]);
|
|
86
86
|
}
|
|
87
87
|
return count;
|
|
@@ -91,13 +91,13 @@ static inline uint64_t warren_count_bits_set_in_matrix(const uint64_t* array, si
|
|
|
91
91
|
|
|
92
92
|
#define CSA(h,l,a,b,c) {uint64_t u = a ^ b; uint64_t v = c; h = (a & b) | (u & v); l = u ^ v;}
|
|
93
93
|
|
|
94
|
-
static inline
|
|
94
|
+
static inline uint32_t count_bits_set_in_matrix(const uint64_t* a, uint32_t length) {
|
|
95
95
|
if ((length & 0x7) != 0) throw std::invalid_argument("the length of the array must be a multiple of 8");
|
|
96
|
-
|
|
96
|
+
uint32_t total = 0;
|
|
97
97
|
uint64_t ones, twos, twos_a, twos_b, fours, fours_a, fours_b, eights;
|
|
98
98
|
fours = twos = ones = 0;
|
|
99
99
|
|
|
100
|
-
for (
|
|
100
|
+
for (uint32_t i = 0; i <= length - 8; i += 8) {
|
|
101
101
|
CSA(twos_a, ones, ones, a[i+0], a[i+1]);
|
|
102
102
|
CSA(twos_b, ones, ones, a[i+2], a[i+3]);
|
|
103
103
|
CSA(fours_a, twos, twos, twos_a, twos_b);
|
|
@@ -245,12 +245,12 @@ static inline double icon_exponential_approximation(double k, double c) {
|
|
|
245
245
|
return (0.7940236163830469 * k * pow(2.0, c / k));
|
|
246
246
|
}
|
|
247
247
|
|
|
248
|
-
static inline double compute_icon_estimate(uint8_t lg_k,
|
|
248
|
+
static inline double compute_icon_estimate(uint8_t lg_k, uint32_t c) {
|
|
249
249
|
if (lg_k < ICON_MIN_LOG_K || lg_k > ICON_MAX_LOG_K) throw std::out_of_range("lg_k out of range");
|
|
250
250
|
if (c < 2) return ((c == 0) ? 0.0 : 1.0);
|
|
251
|
-
const
|
|
252
|
-
const double double_k = k;
|
|
253
|
-
const double double_c = c;
|
|
251
|
+
const uint32_t k = 1 << lg_k;
|
|
252
|
+
const double double_k = static_cast<double>(k);
|
|
253
|
+
const double double_c = static_cast<double>(c);
|
|
254
254
|
// Differing thresholds ensure that the approximated estimator is monotonically increasing.
|
|
255
255
|
const double threshold_factor = ((lg_k < 14) ? 5.7 : 5.6);
|
|
256
256
|
if (double_c > (threshold_factor * double_k)) return icon_exponential_approximation(double_k, double_c);
|
|
@@ -29,11 +29,11 @@
|
|
|
29
29
|
|
|
30
30
|
namespace datasketches {
|
|
31
31
|
|
|
32
|
-
static const
|
|
33
|
-
static const
|
|
32
|
+
static const uint32_t U32_TABLE_UPSIZE_NUMER = 3LL;
|
|
33
|
+
static const uint32_t U32_TABLE_UPSIZE_DENOM = 4LL;
|
|
34
34
|
|
|
35
|
-
static const
|
|
36
|
-
static const
|
|
35
|
+
static const uint32_t U32_TABLE_DOWNSIZE_NUMER = 1LL;
|
|
36
|
+
static const uint32_t U32_TABLE_DOWNSIZE_DENOM = 4LL;
|
|
37
37
|
|
|
38
38
|
template<typename A>
|
|
39
39
|
class u32_table {
|
|
@@ -42,7 +42,7 @@ public:
|
|
|
42
42
|
u32_table(const A& allocator);
|
|
43
43
|
u32_table(uint8_t lg_size, uint8_t num_valid_bits, const A& allocator);
|
|
44
44
|
|
|
45
|
-
inline
|
|
45
|
+
inline uint32_t get_num_items() const;
|
|
46
46
|
inline const uint32_t* get_slots() const;
|
|
47
47
|
inline uint8_t get_lg_size() const;
|
|
48
48
|
inline void clear();
|
|
@@ -52,7 +52,7 @@ public:
|
|
|
52
52
|
// returns true iff the item was present and was therefore removed from the table
|
|
53
53
|
inline bool maybe_delete(uint32_t item);
|
|
54
54
|
|
|
55
|
-
static u32_table make_from_pairs(const uint32_t* pairs,
|
|
55
|
+
static u32_table make_from_pairs(const uint32_t* pairs, uint32_t num_pairs, uint8_t lg_k, const A& allocator);
|
|
56
56
|
|
|
57
57
|
vector_u32<A> unwrapping_get_items() const;
|
|
58
58
|
|
|
@@ -69,10 +69,10 @@ private:
|
|
|
69
69
|
|
|
70
70
|
uint8_t lg_size; // log2 of number of slots
|
|
71
71
|
uint8_t num_valid_bits;
|
|
72
|
-
|
|
72
|
+
uint32_t num_items;
|
|
73
73
|
vector_u32<A> slots;
|
|
74
74
|
|
|
75
|
-
inline
|
|
75
|
+
inline uint32_t lookup(uint32_t item) const;
|
|
76
76
|
inline void must_insert(uint32_t item);
|
|
77
77
|
inline void rebuild(uint8_t new_lg_size);
|
|
78
78
|
};
|
|
@@ -41,14 +41,14 @@ u32_table<A>::u32_table(uint8_t lg_size, uint8_t num_valid_bits, const A& alloca
|
|
|
41
41
|
lg_size(lg_size),
|
|
42
42
|
num_valid_bits(num_valid_bits),
|
|
43
43
|
num_items(0),
|
|
44
|
-
slots(
|
|
44
|
+
slots(1ULL << lg_size, UINT32_MAX, allocator)
|
|
45
45
|
{
|
|
46
46
|
if (lg_size < 2) throw std::invalid_argument("lg_size must be >= 2");
|
|
47
47
|
if (num_valid_bits < 1 || num_valid_bits > 32) throw std::invalid_argument("num_valid_bits must be between 1 and 32");
|
|
48
48
|
}
|
|
49
49
|
|
|
50
50
|
template<typename A>
|
|
51
|
-
|
|
51
|
+
uint32_t u32_table<A>::get_num_items() const {
|
|
52
52
|
return num_items;
|
|
53
53
|
}
|
|
54
54
|
|
|
@@ -70,7 +70,7 @@ void u32_table<A>::clear() {
|
|
|
70
70
|
|
|
71
71
|
template<typename A>
|
|
72
72
|
bool u32_table<A>::maybe_insert(uint32_t item) {
|
|
73
|
-
const
|
|
73
|
+
const uint32_t index = lookup(item);
|
|
74
74
|
if (slots[index] == item) return false;
|
|
75
75
|
if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert");
|
|
76
76
|
slots[index] = item;
|
|
@@ -83,7 +83,7 @@ bool u32_table<A>::maybe_insert(uint32_t item) {
|
|
|
83
83
|
|
|
84
84
|
template<typename A>
|
|
85
85
|
bool u32_table<A>::maybe_delete(uint32_t item) {
|
|
86
|
-
const
|
|
86
|
+
const uint32_t index = lookup(item);
|
|
87
87
|
if (slots[index] == UINT32_MAX) return false;
|
|
88
88
|
if (slots[index] != item) throw std::logic_error("item does not exist");
|
|
89
89
|
if (num_items == 0) throw std::logic_error("delete error");
|
|
@@ -110,7 +110,7 @@ bool u32_table<A>::maybe_delete(uint32_t item) {
|
|
|
110
110
|
|
|
111
111
|
// this one is specifically tailored to be a part of fm85 decompression scheme
|
|
112
112
|
template<typename A>
|
|
113
|
-
u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs,
|
|
113
|
+
u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs, uint32_t num_pairs, uint8_t lg_k, const A& allocator) {
|
|
114
114
|
uint8_t lg_num_slots = 2;
|
|
115
115
|
while (U32_TABLE_UPSIZE_DENOM * num_pairs > U32_TABLE_UPSIZE_NUMER * (1 << lg_num_slots)) lg_num_slots++;
|
|
116
116
|
u32_table<A> table(lg_num_slots, 6 + lg_k, allocator);
|
|
@@ -124,11 +124,11 @@ u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs, size_t num_pai
|
|
|
124
124
|
}
|
|
125
125
|
|
|
126
126
|
template<typename A>
|
|
127
|
-
|
|
128
|
-
const
|
|
129
|
-
const
|
|
127
|
+
uint32_t u32_table<A>::lookup(uint32_t item) const {
|
|
128
|
+
const uint32_t size = 1 << lg_size;
|
|
129
|
+
const uint32_t mask = size - 1;
|
|
130
130
|
const uint8_t shift = num_valid_bits - lg_size;
|
|
131
|
-
|
|
131
|
+
uint32_t probe = item >> shift;
|
|
132
132
|
if (probe > mask) throw std::logic_error("probe out of range");
|
|
133
133
|
while (slots[probe] != item && slots[probe] != UINT32_MAX) {
|
|
134
134
|
probe = (probe + 1) & mask;
|
|
@@ -139,7 +139,7 @@ size_t u32_table<A>::lookup(uint32_t item) const {
|
|
|
139
139
|
// counts and resizing must be handled by the caller
|
|
140
140
|
template<typename A>
|
|
141
141
|
void u32_table<A>::must_insert(uint32_t item) {
|
|
142
|
-
const
|
|
142
|
+
const uint32_t index = lookup(item);
|
|
143
143
|
if (slots[index] == item) throw std::logic_error("item exists");
|
|
144
144
|
if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert");
|
|
145
145
|
slots[index] = item;
|
|
@@ -148,13 +148,13 @@ void u32_table<A>::must_insert(uint32_t item) {
|
|
|
148
148
|
template<typename A>
|
|
149
149
|
void u32_table<A>::rebuild(uint8_t new_lg_size) {
|
|
150
150
|
if (new_lg_size < 2) throw std::logic_error("lg_size must be >= 2");
|
|
151
|
-
const
|
|
152
|
-
const
|
|
151
|
+
const uint32_t old_size = 1 << lg_size;
|
|
152
|
+
const uint32_t new_size = 1 << new_lg_size;
|
|
153
153
|
if (new_size <= num_items) throw std::logic_error("new_size <= num_items");
|
|
154
154
|
vector_u32<A> old_slots = std::move(slots);
|
|
155
155
|
slots = vector_u32<A>(new_size, UINT32_MAX, old_slots.get_allocator());
|
|
156
156
|
lg_size = new_lg_size;
|
|
157
|
-
for (
|
|
157
|
+
for (uint32_t i = 0; i < old_size; i++) {
|
|
158
158
|
if (old_slots[i] != UINT32_MAX) {
|
|
159
159
|
must_insert(old_slots[i]);
|
|
160
160
|
}
|
|
@@ -170,7 +170,7 @@ void u32_table<A>::rebuild(uint8_t new_lg_size) {
|
|
|
170
170
|
template<typename A>
|
|
171
171
|
vector_u32<A> u32_table<A>::unwrapping_get_items() const {
|
|
172
172
|
if (num_items == 0) return vector_u32<A>(slots.get_allocator());
|
|
173
|
-
const
|
|
173
|
+
const uint32_t table_size = 1 << lg_size;
|
|
174
174
|
vector_u32<A> result(num_items, 0, slots.get_allocator());
|
|
175
175
|
size_t i = 0;
|
|
176
176
|
size_t l = 0;
|
|
@@ -27,38 +27,38 @@ namespace datasketches {
|
|
|
27
27
|
typedef u32_table<std::allocator<void>> table;
|
|
28
28
|
|
|
29
29
|
TEST_CASE("cpc sketch: compress and decompress pairs", "[cpc_sketch]") {
|
|
30
|
-
const
|
|
31
|
-
const
|
|
30
|
+
const size_t N = 200;
|
|
31
|
+
const size_t MAXWORDS = 1000;
|
|
32
32
|
|
|
33
33
|
HashState twoHashes;
|
|
34
34
|
uint32_t pairArray[N];
|
|
35
35
|
uint32_t pairArray2[N];
|
|
36
36
|
uint64_t value = 35538947; // some arbitrary starting value
|
|
37
37
|
const uint64_t golden64 = 0x9e3779b97f4a7c13ULL; // the golden ratio
|
|
38
|
-
for (
|
|
38
|
+
for (size_t i = 0; i < N; i++) {
|
|
39
39
|
MurmurHash3_x64_128(&value, sizeof(value), 0, twoHashes);
|
|
40
40
|
uint32_t rand = twoHashes.h1 & 0xffff;
|
|
41
41
|
pairArray[i] = rand;
|
|
42
42
|
value += golden64;
|
|
43
43
|
}
|
|
44
44
|
//table::knuth_shell_sort3(pairArray, 0, N - 1); // unsigned numerical sort
|
|
45
|
-
std::sort(pairArray,
|
|
45
|
+
std::sort(pairArray, pairArray + N);
|
|
46
46
|
uint32_t prev = UINT32_MAX;
|
|
47
|
-
|
|
48
|
-
for (
|
|
47
|
+
uint32_t nxt = 0;
|
|
48
|
+
for (size_t i = 0; i < N; i++) { // uniquify
|
|
49
49
|
if (pairArray[i] != prev) {
|
|
50
50
|
prev = pairArray[i];
|
|
51
51
|
pairArray[nxt++] = pairArray[i];
|
|
52
52
|
}
|
|
53
53
|
}
|
|
54
|
-
|
|
54
|
+
uint32_t numPairs = nxt;
|
|
55
55
|
|
|
56
56
|
uint32_t compressedWords[MAXWORDS];
|
|
57
57
|
|
|
58
|
-
for (
|
|
59
|
-
|
|
58
|
+
for (uint8_t numBaseBits = 0; numBaseBits <= 11; numBaseBits++) {
|
|
59
|
+
uint32_t numWordsWritten = get_compressor<std::allocator<void>>().low_level_compress_pairs(pairArray, numPairs, numBaseBits, compressedWords);
|
|
60
60
|
get_compressor<std::allocator<void>>().low_level_uncompress_pairs(pairArray2, numPairs, numBaseBits, compressedWords, numWordsWritten);
|
|
61
|
-
for (
|
|
61
|
+
for (size_t i = 0; i < numPairs; i++) {
|
|
62
62
|
REQUIRE(pairArray[i] == pairArray2[i]);
|
|
63
63
|
}
|
|
64
64
|
}
|
|
@@ -283,6 +283,26 @@ TEST_CASE("cpc sketch: serialize deserialize sliding, bytes", "[cpc_sketch]") {
|
|
|
283
283
|
REQUIRE(deserialized.validate());
|
|
284
284
|
}
|
|
285
285
|
|
|
286
|
+
TEST_CASE("cpc sketch: serialize deserialize sliding huge", "[cpc_sketch]") {
|
|
287
|
+
cpc_sketch sketch(26);
|
|
288
|
+
const int n = 10000000;
|
|
289
|
+
for (int i = 0; i < n; i++) sketch.update(i);
|
|
290
|
+
REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.001));
|
|
291
|
+
auto bytes = sketch.serialize();
|
|
292
|
+
cpc_sketch deserialized = cpc_sketch::deserialize(bytes.data(), bytes.size());
|
|
293
|
+
REQUIRE(deserialized.is_empty() == sketch.is_empty());
|
|
294
|
+
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
|
|
295
|
+
REQUIRE(deserialized.validate());
|
|
296
|
+
REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), 7), std::out_of_range);
|
|
297
|
+
REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), 15), std::out_of_range);
|
|
298
|
+
REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
|
|
299
|
+
|
|
300
|
+
// updating again with the same values should not change the sketch
|
|
301
|
+
for (int i = 0; i < n; i++) deserialized.update(i);
|
|
302
|
+
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
|
|
303
|
+
REQUIRE(deserialized.validate());
|
|
304
|
+
}
|
|
305
|
+
|
|
286
306
|
TEST_CASE("cpc sketch: copy", "[cpc_sketch]") {
|
|
287
307
|
cpc_sketch s1(11);
|
|
288
308
|
s1.update(1);
|
|
@@ -378,4 +398,9 @@ TEST_CASE("cpc sketch: update string equivalence", "[cpc_sketch]") {
|
|
|
378
398
|
REQUIRE(sketch.get_estimate() == Approx(1).margin(RELATIVE_ERROR_FOR_LG_K_11));
|
|
379
399
|
}
|
|
380
400
|
|
|
401
|
+
TEST_CASE("cpc sketch: max serialized size", "[cpc_sketch]") {
|
|
402
|
+
REQUIRE(cpc_sketch::get_max_serialized_size_bytes(4) == 24 + 40);
|
|
403
|
+
REQUIRE(cpc_sketch::get_max_serialized_size_bytes(26) == static_cast<size_t>((0.6 * (1 << 26)) + 40));
|
|
404
|
+
}
|
|
405
|
+
|
|
381
406
|
} /* namespace datasketches */
|
|
@@ -65,7 +65,7 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
|
65
65
|
void frequent_items_sketch<T, W, H, E, S, A>::merge(const frequent_items_sketch& other) {
|
|
66
66
|
if (other.is_empty()) return;
|
|
67
67
|
const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
|
|
68
|
-
for (auto
|
|
68
|
+
for (auto it: other.map) {
|
|
69
69
|
update(it.first, it.second);
|
|
70
70
|
}
|
|
71
71
|
offset += other.offset;
|
|
@@ -76,7 +76,7 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
|
76
76
|
void frequent_items_sketch<T, W, H, E, S, A>::merge(frequent_items_sketch&& other) {
|
|
77
77
|
if (other.is_empty()) return;
|
|
78
78
|
const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
|
|
79
|
-
for (auto
|
|
79
|
+
for (auto it: other.map) {
|
|
80
80
|
update(std::move(it.first), it.second);
|
|
81
81
|
}
|
|
82
82
|
offset += other.offset;
|
|
@@ -147,7 +147,7 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
|
147
147
|
typename frequent_items_sketch<T, W, H, E, S, A>::vector_row
|
|
148
148
|
frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error_type err_type, W threshold) const {
|
|
149
149
|
vector_row items(map.get_allocator());
|
|
150
|
-
for (auto
|
|
150
|
+
for (auto it: map) {
|
|
151
151
|
const W lb = it.second;
|
|
152
152
|
const W ub = it.second + offset;
|
|
153
153
|
if ((err_type == NO_FALSE_NEGATIVES && ub > threshold) || (err_type == NO_FALSE_POSITIVES && lb > threshold)) {
|
|
@@ -162,28 +162,28 @@ frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error
|
|
|
162
162
|
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
163
163
|
void frequent_items_sketch<T, W, H, E, S, A>::serialize(std::ostream& os) const {
|
|
164
164
|
const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_NONEMPTY;
|
|
165
|
-
|
|
165
|
+
write(os, preamble_longs);
|
|
166
166
|
const uint8_t serial_version = SERIAL_VERSION;
|
|
167
|
-
|
|
167
|
+
write(os, serial_version);
|
|
168
168
|
const uint8_t family = FAMILY_ID;
|
|
169
|
-
|
|
169
|
+
write(os, family);
|
|
170
170
|
const uint8_t lg_max_size = map.get_lg_max_size();
|
|
171
|
-
|
|
171
|
+
write(os, lg_max_size);
|
|
172
172
|
const uint8_t lg_cur_size = map.get_lg_cur_size();
|
|
173
|
-
|
|
173
|
+
write(os, lg_cur_size);
|
|
174
174
|
const uint8_t flags_byte(
|
|
175
175
|
(is_empty() ? 1 << flags::IS_EMPTY : 0)
|
|
176
176
|
);
|
|
177
|
-
|
|
177
|
+
write(os, flags_byte);
|
|
178
178
|
const uint16_t unused16 = 0;
|
|
179
|
-
|
|
179
|
+
write(os, unused16);
|
|
180
180
|
if (!is_empty()) {
|
|
181
181
|
const uint32_t num_items = map.get_num_active();
|
|
182
|
-
|
|
182
|
+
write(os, num_items);
|
|
183
183
|
const uint32_t unused32 = 0;
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
184
|
+
write(os, unused32);
|
|
185
|
+
write(os, total_weight);
|
|
186
|
+
write(os, offset);
|
|
187
187
|
|
|
188
188
|
// copy active items and their weights to use batch serialization
|
|
189
189
|
using AllocW = typename std::allocator_traits<A>::template rebind_alloc<W>;
|
|
@@ -192,14 +192,14 @@ void frequent_items_sketch<T, W, H, E, S, A>::serialize(std::ostream& os) const
|
|
|
192
192
|
A alloc(map.get_allocator());
|
|
193
193
|
T* items = alloc.allocate(num_items);
|
|
194
194
|
uint32_t i = 0;
|
|
195
|
-
for (auto
|
|
195
|
+
for (auto it: map) {
|
|
196
196
|
new (&items[i]) T(it.first);
|
|
197
197
|
weights[i++] = it.second;
|
|
198
198
|
}
|
|
199
|
-
|
|
199
|
+
write(os, weights, sizeof(W) * num_items);
|
|
200
200
|
aw.deallocate(weights, num_items);
|
|
201
201
|
S().serialize(os, items, num_items);
|
|
202
|
-
for (
|
|
202
|
+
for (i = 0; i < num_items; i++) items[i].~T();
|
|
203
203
|
alloc.deallocate(items, num_items);
|
|
204
204
|
}
|
|
205
205
|
}
|
|
@@ -208,7 +208,7 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
|
208
208
|
size_t frequent_items_sketch<T, W, H, E, S, A>::get_serialized_size_bytes() const {
|
|
209
209
|
if (is_empty()) return PREAMBLE_LONGS_EMPTY * sizeof(uint64_t);
|
|
210
210
|
size_t size = PREAMBLE_LONGS_NONEMPTY * sizeof(uint64_t) + map.get_num_active() * sizeof(W);
|
|
211
|
-
for (auto
|
|
211
|
+
for (auto it: map) size += S().size_of_item(it.first);
|
|
212
212
|
return size;
|
|
213
213
|
}
|
|
214
214
|
|
|
@@ -220,28 +220,26 @@ auto frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_byt
|
|
|
220
220
|
uint8_t* end_ptr = ptr + size;
|
|
221
221
|
|
|
222
222
|
const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_NONEMPTY;
|
|
223
|
-
ptr += copy_to_mem(
|
|
223
|
+
ptr += copy_to_mem(preamble_longs, ptr);
|
|
224
224
|
const uint8_t serial_version = SERIAL_VERSION;
|
|
225
|
-
ptr += copy_to_mem(
|
|
225
|
+
ptr += copy_to_mem(serial_version, ptr);
|
|
226
226
|
const uint8_t family = FAMILY_ID;
|
|
227
|
-
ptr += copy_to_mem(
|
|
227
|
+
ptr += copy_to_mem(family, ptr);
|
|
228
228
|
const uint8_t lg_max_size = map.get_lg_max_size();
|
|
229
|
-
ptr += copy_to_mem(
|
|
229
|
+
ptr += copy_to_mem(lg_max_size, ptr);
|
|
230
230
|
const uint8_t lg_cur_size = map.get_lg_cur_size();
|
|
231
|
-
ptr += copy_to_mem(
|
|
231
|
+
ptr += copy_to_mem(lg_cur_size, ptr);
|
|
232
232
|
const uint8_t flags_byte(
|
|
233
233
|
(is_empty() ? 1 << flags::IS_EMPTY : 0)
|
|
234
234
|
);
|
|
235
|
-
ptr += copy_to_mem(
|
|
236
|
-
|
|
237
|
-
ptr += copy_to_mem(&unused16, ptr, sizeof(uint16_t));
|
|
235
|
+
ptr += copy_to_mem(flags_byte, ptr);
|
|
236
|
+
ptr += sizeof(uint16_t); // unused
|
|
238
237
|
if (!is_empty()) {
|
|
239
238
|
const uint32_t num_items = map.get_num_active();
|
|
240
|
-
ptr += copy_to_mem(
|
|
241
|
-
|
|
242
|
-
ptr += copy_to_mem(
|
|
243
|
-
ptr += copy_to_mem(
|
|
244
|
-
ptr += copy_to_mem(&offset, ptr, sizeof(offset));
|
|
239
|
+
ptr += copy_to_mem(num_items, ptr);
|
|
240
|
+
ptr += sizeof(uint32_t); // unused
|
|
241
|
+
ptr += copy_to_mem(total_weight, ptr);
|
|
242
|
+
ptr += copy_to_mem(offset, ptr);
|
|
245
243
|
|
|
246
244
|
// copy active items and their weights to use batch serialization
|
|
247
245
|
using AllocW = typename std::allocator_traits<A>::template rebind_alloc<W>;
|
|
@@ -250,7 +248,7 @@ auto frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_byt
|
|
|
250
248
|
A alloc(map.get_allocator());
|
|
251
249
|
T* items = alloc.allocate(num_items);
|
|
252
250
|
uint32_t i = 0;
|
|
253
|
-
for (auto
|
|
251
|
+
for (auto it: map) {
|
|
254
252
|
new (&items[i]) T(it.first);
|
|
255
253
|
weights[i++] = it.second;
|
|
256
254
|
}
|
|
@@ -258,7 +256,7 @@ auto frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_byt
|
|
|
258
256
|
aw.deallocate(weights, num_items);
|
|
259
257
|
const size_t bytes_remaining = end_ptr - ptr;
|
|
260
258
|
ptr += S().serialize(ptr, bytes_remaining, items, num_items);
|
|
261
|
-
for (
|
|
259
|
+
for (i = 0; i < num_items; i++) items[i].~T();
|
|
262
260
|
alloc.deallocate(items, num_items);
|
|
263
261
|
}
|
|
264
262
|
return bytes;
|
|
@@ -268,38 +266,31 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
|
268
266
|
class frequent_items_sketch<T, W, H, E, S, A>::items_deleter {
|
|
269
267
|
public:
|
|
270
268
|
items_deleter(uint32_t num, bool destroy, const A& allocator):
|
|
271
|
-
|
|
272
|
-
void set_destroy(bool destroy) {
|
|
269
|
+
allocator_(allocator), num_(num), destroy_(destroy) {}
|
|
270
|
+
void set_destroy(bool destroy) { destroy_ = destroy; }
|
|
273
271
|
void operator() (T* ptr) {
|
|
274
272
|
if (ptr != nullptr) {
|
|
275
|
-
if (
|
|
276
|
-
for (uint32_t i = 0; i <
|
|
273
|
+
if (destroy_) {
|
|
274
|
+
for (uint32_t i = 0; i < num_; ++i) ptr[i].~T();
|
|
277
275
|
}
|
|
278
|
-
|
|
276
|
+
allocator_.deallocate(ptr, num_);
|
|
279
277
|
}
|
|
280
278
|
}
|
|
281
279
|
private:
|
|
282
|
-
A
|
|
283
|
-
uint32_t
|
|
284
|
-
bool
|
|
280
|
+
A allocator_;
|
|
281
|
+
uint32_t num_;
|
|
282
|
+
bool destroy_;
|
|
285
283
|
};
|
|
286
284
|
|
|
287
285
|
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
288
286
|
frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(std::istream& is, const A& allocator) {
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
uint8_t
|
|
292
|
-
|
|
293
|
-
uint8_t
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
is.read((char*)&lg_max_size, sizeof(lg_max_size));
|
|
297
|
-
uint8_t lg_cur_size;
|
|
298
|
-
is.read((char*)&lg_cur_size, sizeof(lg_cur_size));
|
|
299
|
-
uint8_t flags_byte;
|
|
300
|
-
is.read((char*)&flags_byte, sizeof(flags_byte));
|
|
301
|
-
uint16_t unused16;
|
|
302
|
-
is.read((char*)&unused16, sizeof(unused16));
|
|
287
|
+
const auto preamble_longs = read<uint8_t>(is);
|
|
288
|
+
const auto serial_version = read<uint8_t>(is);
|
|
289
|
+
const auto family_id = read<uint8_t>(is);
|
|
290
|
+
const auto lg_max_size = read<uint8_t>(is);
|
|
291
|
+
const auto lg_cur_size = read<uint8_t>(is);
|
|
292
|
+
const auto flags_byte = read<uint8_t>(is);
|
|
293
|
+
read<uint16_t>(is); // unused
|
|
303
294
|
|
|
304
295
|
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
|
305
296
|
|
|
@@ -310,19 +301,15 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
|
|
|
310
301
|
|
|
311
302
|
frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size, allocator);
|
|
312
303
|
if (!is_empty) {
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
W total_weight;
|
|
318
|
-
is.read((char*)&total_weight, sizeof(total_weight));
|
|
319
|
-
W offset;
|
|
320
|
-
is.read((char*)&offset, sizeof(offset));
|
|
304
|
+
const auto num_items = read<uint32_t>(is);
|
|
305
|
+
read<uint32_t>(is); // unused
|
|
306
|
+
const auto total_weight = read<W>(is);
|
|
307
|
+
const auto offset = read<W>(is);
|
|
321
308
|
|
|
322
309
|
// batch deserialization with intermediate array of items and weights
|
|
323
310
|
using AllocW = typename std::allocator_traits<A>::template rebind_alloc<W>;
|
|
324
311
|
std::vector<W, AllocW> weights(num_items, 0, allocator);
|
|
325
|
-
|
|
312
|
+
read(is, weights.data(), sizeof(W) * num_items);
|
|
326
313
|
A alloc(allocator);
|
|
327
314
|
std::unique_ptr<T, items_deleter> items(alloc.allocate(num_items), items_deleter(num_items, false, alloc));
|
|
328
315
|
S().deserialize(is, items.get(), num_items);
|
|
@@ -344,19 +331,18 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
|
|
|
344
331
|
const char* ptr = static_cast<const char*>(bytes);
|
|
345
332
|
const char* base = static_cast<const char*>(bytes);
|
|
346
333
|
uint8_t preamble_longs;
|
|
347
|
-
ptr += copy_from_mem(ptr,
|
|
334
|
+
ptr += copy_from_mem(ptr, preamble_longs);
|
|
348
335
|
uint8_t serial_version;
|
|
349
|
-
ptr += copy_from_mem(ptr,
|
|
336
|
+
ptr += copy_from_mem(ptr, serial_version);
|
|
350
337
|
uint8_t family_id;
|
|
351
|
-
ptr += copy_from_mem(ptr,
|
|
338
|
+
ptr += copy_from_mem(ptr, family_id);
|
|
352
339
|
uint8_t lg_max_size;
|
|
353
|
-
ptr += copy_from_mem(ptr,
|
|
340
|
+
ptr += copy_from_mem(ptr, lg_max_size);
|
|
354
341
|
uint8_t lg_cur_size;
|
|
355
|
-
ptr += copy_from_mem(ptr,
|
|
342
|
+
ptr += copy_from_mem(ptr, lg_cur_size);
|
|
356
343
|
uint8_t flags_byte;
|
|
357
|
-
ptr += copy_from_mem(ptr,
|
|
358
|
-
uint16_t
|
|
359
|
-
ptr += copy_from_mem(ptr, &unused16, sizeof(uint16_t));
|
|
344
|
+
ptr += copy_from_mem(ptr, flags_byte);
|
|
345
|
+
ptr += sizeof(uint16_t); // unused
|
|
360
346
|
|
|
361
347
|
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
|
362
348
|
|
|
@@ -364,18 +350,17 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
|
|
|
364
350
|
check_serial_version(serial_version);
|
|
365
351
|
check_family_id(family_id);
|
|
366
352
|
check_size(lg_cur_size, lg_max_size);
|
|
367
|
-
ensure_minimum_memory(size,
|
|
353
|
+
ensure_minimum_memory(size, 1ULL << preamble_longs);
|
|
368
354
|
|
|
369
355
|
frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size, allocator);
|
|
370
356
|
if (!is_empty) {
|
|
371
357
|
uint32_t num_items;
|
|
372
|
-
ptr += copy_from_mem(ptr,
|
|
373
|
-
uint32_t
|
|
374
|
-
ptr += copy_from_mem(ptr, &unused32, sizeof(uint32_t));
|
|
358
|
+
ptr += copy_from_mem(ptr, num_items);
|
|
359
|
+
ptr += sizeof(uint32_t); // unused
|
|
375
360
|
W total_weight;
|
|
376
|
-
ptr += copy_from_mem(ptr,
|
|
361
|
+
ptr += copy_from_mem(ptr, total_weight);
|
|
377
362
|
W offset;
|
|
378
|
-
ptr += copy_from_mem(ptr,
|
|
363
|
+
ptr += copy_from_mem(ptr, offset);
|
|
379
364
|
|
|
380
365
|
ensure_minimum_memory(size, ptr - base + (sizeof(W) * num_items));
|
|
381
366
|
// batch deserialization with intermediate array of items and weights
|
|
@@ -446,14 +431,14 @@ string<A> frequent_items_sketch<T, W, H, E, S, A>::to_string(bool print_items) c
|
|
|
446
431
|
os << "### End sketch summary" << std::endl;
|
|
447
432
|
if (print_items) {
|
|
448
433
|
vector_row items;
|
|
449
|
-
for (auto
|
|
434
|
+
for (auto it: map) {
|
|
450
435
|
items.push_back(row(&it.first, it.second, offset));
|
|
451
436
|
}
|
|
452
437
|
// sort by estimate in descending order
|
|
453
438
|
std::sort(items.begin(), items.end(), [](row a, row b){ return a.get_estimate() > b.get_estimate(); });
|
|
454
439
|
os << "### Items in descending order by estimate" << std::endl;
|
|
455
440
|
os << " item, estimate, lower bound, upper bound" << std::endl;
|
|
456
|
-
for (auto
|
|
441
|
+
for (auto it: items) {
|
|
457
442
|
os << " " << it.get_item() << ", " << it.get_estimate() << ", "
|
|
458
443
|
<< it.get_lower_bound() << ", " << it.get_upper_bound() << std::endl;
|
|
459
444
|
}
|