datasketches 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
- metadata +8 -3
|
@@ -192,7 +192,7 @@ public:
|
|
|
192
192
|
* @param value pointer to the data
|
|
193
193
|
* @param size length of the data in bytes
|
|
194
194
|
*/
|
|
195
|
-
void update(const void* value,
|
|
195
|
+
void update(const void* value, size_t size);
|
|
196
196
|
|
|
197
197
|
/**
|
|
198
198
|
* Returns a human-readable summary of this sketch
|
|
@@ -235,6 +235,17 @@ public:
|
|
|
235
235
|
*/
|
|
236
236
|
static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
|
|
237
237
|
|
|
238
|
+
/**
|
|
239
|
+
* The actual size of a compressed CPC sketch has a small random variance, but the following
|
|
240
|
+
* empirically measured size should be large enough for at least 99.9 percent of sketches.
|
|
241
|
+
*
|
|
242
|
+
* <p>For small values of <i>n</i> the size can be much smaller.
|
|
243
|
+
*
|
|
244
|
+
* @param lg_k the given value of lg_k.
|
|
245
|
+
* @return the estimated maximum compressed serialized size of a sketch.
|
|
246
|
+
*/
|
|
247
|
+
static size_t get_max_serialized_size_bytes(uint8_t lg_k);
|
|
248
|
+
|
|
238
249
|
// for internal use
|
|
239
250
|
uint32_t get_num_coupons() const;
|
|
240
251
|
|
|
@@ -303,6 +314,8 @@ private:
|
|
|
303
314
|
inline void write_hip(std::ostream& os) const;
|
|
304
315
|
inline size_t copy_hip_to_mem(void* dst) const;
|
|
305
316
|
|
|
317
|
+
static void check_lg_k(uint8_t lg_k);
|
|
318
|
+
|
|
306
319
|
friend cpc_compressor<A>;
|
|
307
320
|
friend cpc_union_alloc<A>;
|
|
308
321
|
};
|
|
@@ -53,9 +53,7 @@ first_interesting_column(0),
|
|
|
53
53
|
kxp(1 << lg_k),
|
|
54
54
|
hip_est_accum(0)
|
|
55
55
|
{
|
|
56
|
-
|
|
57
|
-
throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
|
|
58
|
-
}
|
|
56
|
+
check_lg_k(lg_k);
|
|
59
57
|
}
|
|
60
58
|
|
|
61
59
|
template<typename A>
|
|
@@ -176,7 +174,7 @@ void cpc_sketch_alloc<A>::update(float value) {
|
|
|
176
174
|
|
|
177
175
|
static inline uint32_t row_col_from_two_hashes(uint64_t hash0, uint64_t hash1, uint8_t lg_k) {
|
|
178
176
|
if (lg_k > 26) throw std::logic_error("lg_k > 26");
|
|
179
|
-
const
|
|
177
|
+
const uint32_t k = 1 << lg_k;
|
|
180
178
|
uint8_t col = count_leading_zeros_in_u64(hash1); // 0 <= col <= 64
|
|
181
179
|
if (col > 63) col = 63; // clip so that 0 <= col <= 63
|
|
182
180
|
const uint32_t row = hash0 & (k - 1);
|
|
@@ -188,7 +186,7 @@ static inline uint32_t row_col_from_two_hashes(uint64_t hash0, uint64_t hash1, u
|
|
|
188
186
|
}
|
|
189
187
|
|
|
190
188
|
template<typename A>
|
|
191
|
-
void cpc_sketch_alloc<A>::update(const void* value,
|
|
189
|
+
void cpc_sketch_alloc<A>::update(const void* value, size_t size) {
|
|
192
190
|
HashState hashes;
|
|
193
191
|
MurmurHash3_x64_128(value, size, seed, hashes);
|
|
194
192
|
row_col_update(row_col_from_two_hashes(hashes.h1, hashes.h2, lg_k));
|
|
@@ -208,7 +206,7 @@ void cpc_sketch_alloc<A>::row_col_update(uint32_t row_col) {
|
|
|
208
206
|
|
|
209
207
|
template<typename A>
|
|
210
208
|
void cpc_sketch_alloc<A>::update_sparse(uint32_t row_col) {
|
|
211
|
-
const
|
|
209
|
+
const uint32_t k = 1 << lg_k;
|
|
212
210
|
const uint64_t c32pre = static_cast<uint64_t>(num_coupons) << 5;
|
|
213
211
|
if (c32pre >= 3 * k) throw std::logic_error("c32pre >= 3 * k"); // C < 3K/32, in other words flavor == SPARSE
|
|
214
212
|
bool is_novel = surprising_value_table.maybe_insert(row_col);
|
|
@@ -224,7 +222,7 @@ void cpc_sketch_alloc<A>::update_sparse(uint32_t row_col) {
|
|
|
224
222
|
template<typename A>
|
|
225
223
|
void cpc_sketch_alloc<A>::update_windowed(uint32_t row_col) {
|
|
226
224
|
if (window_offset > 56) throw std::logic_error("wrong window offset");
|
|
227
|
-
const
|
|
225
|
+
const uint32_t k = 1 << lg_k;
|
|
228
226
|
const uint64_t c32pre = static_cast<uint64_t>(num_coupons) << 5;
|
|
229
227
|
if (c32pre < 3 * k) throw std::logic_error("c32pre < 3 * k"); // C >= 3K/32, in other words flavor >= HYBRID
|
|
230
228
|
const uint64_t c8pre = static_cast<uint64_t>(num_coupons) << 3;
|
|
@@ -266,7 +264,7 @@ void cpc_sketch_alloc<A>::update_windowed(uint32_t row_col) {
|
|
|
266
264
|
// Call this whenever a new coupon has been collected.
|
|
267
265
|
template<typename A>
|
|
268
266
|
void cpc_sketch_alloc<A>::update_hip(uint32_t row_col) {
|
|
269
|
-
const
|
|
267
|
+
const uint32_t k = 1 << lg_k;
|
|
270
268
|
const uint8_t col = row_col & 63;
|
|
271
269
|
const double one_over_p = static_cast<double>(k) / kxp;
|
|
272
270
|
hip_est_accum += one_over_p;
|
|
@@ -276,7 +274,7 @@ void cpc_sketch_alloc<A>::update_hip(uint32_t row_col) {
|
|
|
276
274
|
// In terms of flavor, this promotes SPARSE to HYBRID
|
|
277
275
|
template<typename A>
|
|
278
276
|
void cpc_sketch_alloc<A>::promote_sparse_to_windowed() {
|
|
279
|
-
const
|
|
277
|
+
const uint32_t k = 1 << lg_k;
|
|
280
278
|
const uint64_t c32 = static_cast<uint64_t>(num_coupons) << 5;
|
|
281
279
|
if (!(c32 == 3 * k || (lg_k == 4 && c32 > 3 * k))) throw std::logic_error("wrong c32");
|
|
282
280
|
|
|
@@ -285,16 +283,16 @@ void cpc_sketch_alloc<A>::promote_sparse_to_windowed() {
|
|
|
285
283
|
u32_table<A> new_table(2, 6 + lg_k, sliding_window.get_allocator());
|
|
286
284
|
|
|
287
285
|
const uint32_t* old_slots = surprising_value_table.get_slots();
|
|
288
|
-
const
|
|
286
|
+
const uint32_t old_num_slots = 1 << surprising_value_table.get_lg_size();
|
|
289
287
|
|
|
290
288
|
if (window_offset != 0) throw std::logic_error("window_offset != 0");
|
|
291
289
|
|
|
292
|
-
for (
|
|
290
|
+
for (uint32_t i = 0; i < old_num_slots; i++) {
|
|
293
291
|
const uint32_t row_col = old_slots[i];
|
|
294
292
|
if (row_col != UINT32_MAX) {
|
|
295
293
|
const uint8_t col = row_col & 63;
|
|
296
294
|
if (col < 8) {
|
|
297
|
-
const
|
|
295
|
+
const uint32_t row = row_col >> 6;
|
|
298
296
|
sliding_window[row] |= 1 << col;
|
|
299
297
|
} else {
|
|
300
298
|
// cannot use u32_table::must_insert(), because it doesn't provide for growth
|
|
@@ -314,7 +312,7 @@ void cpc_sketch_alloc<A>::move_window() {
|
|
|
314
312
|
if (new_offset != determine_correct_offset(lg_k, num_coupons)) throw std::logic_error("new_offset is wrong");
|
|
315
313
|
|
|
316
314
|
if (sliding_window.size() == 0) throw std::logic_error("no sliding window");
|
|
317
|
-
const
|
|
315
|
+
const uint32_t k = 1 << lg_k;
|
|
318
316
|
|
|
319
317
|
// Construct the full-sized bit matrix that corresponds to the sketch
|
|
320
318
|
vector_u64<A> bit_matrix = build_bit_matrix();
|
|
@@ -328,7 +326,7 @@ void cpc_sketch_alloc<A>::move_window() {
|
|
|
328
326
|
const uint64_t mask_for_flipping_early_zone = (static_cast<uint64_t>(1) << new_offset) - 1;
|
|
329
327
|
uint64_t all_surprises_ored = 0;
|
|
330
328
|
|
|
331
|
-
for (
|
|
329
|
+
for (uint32_t i = 0; i < k; i++) {
|
|
332
330
|
uint64_t pattern = bit_matrix[i];
|
|
333
331
|
sliding_window[i] = (pattern >> new_offset) & 0xff;
|
|
334
332
|
pattern &= mask_for_clearing_window;
|
|
@@ -357,7 +355,7 @@ void cpc_sketch_alloc<A>::move_window() {
|
|
|
357
355
|
// so that it will reflect changes that were previously outside the mantissa.
|
|
358
356
|
template<typename A>
|
|
359
357
|
void cpc_sketch_alloc<A>::refresh_kxp(const uint64_t* bit_matrix) {
|
|
360
|
-
const
|
|
358
|
+
const uint32_t k = 1 << lg_k;
|
|
361
359
|
|
|
362
360
|
// for improved numerical accuracy, we separately sum the bytes of the U64's
|
|
363
361
|
double byte_sums[8]; // allocating on the stack
|
|
@@ -415,44 +413,44 @@ void cpc_sketch_alloc<A>::serialize(std::ostream& os) const {
|
|
|
415
413
|
const bool has_table = compressed.table_data.size() > 0;
|
|
416
414
|
const bool has_window = compressed.window_data.size() > 0;
|
|
417
415
|
const uint8_t preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window);
|
|
418
|
-
|
|
416
|
+
write(os, preamble_ints);
|
|
419
417
|
const uint8_t serial_version = SERIAL_VERSION;
|
|
420
|
-
|
|
418
|
+
write(os, serial_version);
|
|
421
419
|
const uint8_t family = FAMILY;
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
420
|
+
write(os, family);
|
|
421
|
+
write(os, lg_k);
|
|
422
|
+
write(os, first_interesting_column);
|
|
425
423
|
const uint8_t flags_byte(
|
|
426
424
|
(1 << flags::IS_COMPRESSED)
|
|
427
425
|
| (has_hip ? 1 << flags::HAS_HIP : 0)
|
|
428
426
|
| (has_table ? 1 << flags::HAS_TABLE : 0)
|
|
429
427
|
| (has_window ? 1 << flags::HAS_WINDOW : 0)
|
|
430
428
|
);
|
|
431
|
-
|
|
429
|
+
write(os, flags_byte);
|
|
432
430
|
const uint16_t seed_hash(compute_seed_hash(seed));
|
|
433
|
-
|
|
431
|
+
write(os, seed_hash);
|
|
434
432
|
if (!is_empty()) {
|
|
435
|
-
|
|
433
|
+
write(os, num_coupons);
|
|
436
434
|
if (has_table && has_window) {
|
|
437
435
|
// if there is no window it is the same as number of coupons
|
|
438
|
-
|
|
436
|
+
write(os, compressed.table_num_entries);
|
|
439
437
|
// HIP values can be in two different places in the sequence of fields
|
|
440
438
|
// this is the first HIP decision point
|
|
441
439
|
if (has_hip) write_hip(os);
|
|
442
440
|
}
|
|
443
441
|
if (has_table) {
|
|
444
|
-
|
|
442
|
+
write(os, compressed.table_data_words);
|
|
445
443
|
}
|
|
446
444
|
if (has_window) {
|
|
447
|
-
|
|
445
|
+
write(os, compressed.window_data_words);
|
|
448
446
|
}
|
|
449
447
|
// this is the second HIP decision point
|
|
450
448
|
if (has_hip && !(has_table && has_window)) write_hip(os);
|
|
451
449
|
if (has_window) {
|
|
452
|
-
|
|
450
|
+
write(os, compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
|
|
453
451
|
}
|
|
454
452
|
if (has_table) {
|
|
455
|
-
|
|
453
|
+
write(os, compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
|
|
456
454
|
}
|
|
457
455
|
}
|
|
458
456
|
}
|
|
@@ -471,36 +469,36 @@ vector_u8<A> cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
|
|
|
471
469
|
const size_t size = header_size_bytes + (preamble_ints + compressed.table_data_words + compressed.window_data_words) * sizeof(uint32_t);
|
|
472
470
|
vector_u8<A> bytes(size, 0, sliding_window.get_allocator());
|
|
473
471
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
474
|
-
ptr += copy_to_mem(
|
|
472
|
+
ptr += copy_to_mem(preamble_ints, ptr);
|
|
475
473
|
const uint8_t serial_version = SERIAL_VERSION;
|
|
476
|
-
ptr += copy_to_mem(
|
|
474
|
+
ptr += copy_to_mem(serial_version, ptr);
|
|
477
475
|
const uint8_t family = FAMILY;
|
|
478
|
-
ptr += copy_to_mem(
|
|
479
|
-
ptr += copy_to_mem(
|
|
480
|
-
ptr += copy_to_mem(
|
|
476
|
+
ptr += copy_to_mem(family, ptr);
|
|
477
|
+
ptr += copy_to_mem(lg_k, ptr);
|
|
478
|
+
ptr += copy_to_mem(first_interesting_column, ptr);
|
|
481
479
|
const uint8_t flags_byte(
|
|
482
480
|
(1 << flags::IS_COMPRESSED)
|
|
483
481
|
| (has_hip ? 1 << flags::HAS_HIP : 0)
|
|
484
482
|
| (has_table ? 1 << flags::HAS_TABLE : 0)
|
|
485
483
|
| (has_window ? 1 << flags::HAS_WINDOW : 0)
|
|
486
484
|
);
|
|
487
|
-
ptr += copy_to_mem(
|
|
485
|
+
ptr += copy_to_mem(flags_byte, ptr);
|
|
488
486
|
const uint16_t seed_hash = compute_seed_hash(seed);
|
|
489
|
-
ptr += copy_to_mem(
|
|
487
|
+
ptr += copy_to_mem(seed_hash, ptr);
|
|
490
488
|
if (!is_empty()) {
|
|
491
|
-
ptr += copy_to_mem(
|
|
489
|
+
ptr += copy_to_mem(num_coupons, ptr);
|
|
492
490
|
if (has_table && has_window) {
|
|
493
491
|
// if there is no window it is the same as number of coupons
|
|
494
|
-
ptr += copy_to_mem(
|
|
492
|
+
ptr += copy_to_mem(compressed.table_num_entries, ptr);
|
|
495
493
|
// HIP values can be in two different places in the sequence of fields
|
|
496
494
|
// this is the first HIP decision point
|
|
497
495
|
if (has_hip) ptr += copy_hip_to_mem(ptr);
|
|
498
496
|
}
|
|
499
497
|
if (has_table) {
|
|
500
|
-
ptr += copy_to_mem(
|
|
498
|
+
ptr += copy_to_mem(compressed.table_data_words, ptr);
|
|
501
499
|
}
|
|
502
500
|
if (has_window) {
|
|
503
|
-
ptr += copy_to_mem(
|
|
501
|
+
ptr += copy_to_mem(compressed.window_data_words, ptr);
|
|
504
502
|
}
|
|
505
503
|
// this is the second HIP decision point
|
|
506
504
|
if (has_hip && !(has_table && has_window)) ptr += copy_hip_to_mem(ptr);
|
|
@@ -517,20 +515,13 @@ vector_u8<A> cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
|
|
|
517
515
|
|
|
518
516
|
template<typename A>
|
|
519
517
|
cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
uint8_t
|
|
523
|
-
|
|
524
|
-
uint8_t
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
is.read((char*)&lg_k, sizeof(lg_k));
|
|
528
|
-
uint8_t first_interesting_column;
|
|
529
|
-
is.read((char*)&first_interesting_column, sizeof(first_interesting_column));
|
|
530
|
-
uint8_t flags_byte;
|
|
531
|
-
is.read((char*)&flags_byte, sizeof(flags_byte));
|
|
532
|
-
uint16_t seed_hash;
|
|
533
|
-
is.read((char*)&seed_hash, sizeof(seed_hash));
|
|
518
|
+
const auto preamble_ints = read<uint8_t>(is);
|
|
519
|
+
const auto serial_version = read<uint8_t>(is);
|
|
520
|
+
const auto family_id = read<uint8_t>(is);
|
|
521
|
+
const auto lg_k = read<uint8_t>(is);
|
|
522
|
+
const auto first_interesting_column = read<uint8_t>(is);
|
|
523
|
+
const auto flags_byte = read<uint8_t>(is);
|
|
524
|
+
const auto seed_hash = read<uint16_t>(is);
|
|
534
525
|
const bool has_hip = flags_byte & (1 << flags::HAS_HIP);
|
|
535
526
|
const bool has_table = flags_byte & (1 << flags::HAS_TABLE);
|
|
536
527
|
const bool has_window = flags_byte & (1 << flags::HAS_WINDOW);
|
|
@@ -542,31 +533,31 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(std::istream& is, uint64_t
|
|
|
542
533
|
double kxp = 0;
|
|
543
534
|
double hip_est_accum = 0;
|
|
544
535
|
if (has_table || has_window) {
|
|
545
|
-
|
|
536
|
+
num_coupons = read<uint32_t>(is);
|
|
546
537
|
if (has_table && has_window) {
|
|
547
|
-
|
|
538
|
+
compressed.table_num_entries = read<uint32_t>(is);
|
|
548
539
|
if (has_hip) {
|
|
549
|
-
|
|
550
|
-
|
|
540
|
+
kxp = read<double>(is);
|
|
541
|
+
hip_est_accum = read<double>(is);
|
|
551
542
|
}
|
|
552
543
|
}
|
|
553
544
|
if (has_table) {
|
|
554
|
-
|
|
545
|
+
compressed.table_data_words = read<uint32_t>(is);
|
|
555
546
|
}
|
|
556
547
|
if (has_window) {
|
|
557
|
-
|
|
548
|
+
compressed.window_data_words = read<uint32_t>(is);
|
|
558
549
|
}
|
|
559
550
|
if (has_hip && !(has_table && has_window)) {
|
|
560
|
-
|
|
561
|
-
|
|
551
|
+
kxp = read<double>(is);
|
|
552
|
+
hip_est_accum = read<double>(is);
|
|
562
553
|
}
|
|
563
554
|
if (has_window) {
|
|
564
555
|
compressed.window_data.resize(compressed.window_data_words);
|
|
565
|
-
|
|
556
|
+
read(is, compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
|
|
566
557
|
}
|
|
567
558
|
if (has_table) {
|
|
568
559
|
compressed.table_data.resize(compressed.table_data_words);
|
|
569
|
-
|
|
560
|
+
read(is, compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
|
|
570
561
|
}
|
|
571
562
|
if (!has_window) compressed.table_num_entries = num_coupons;
|
|
572
563
|
}
|
|
@@ -602,19 +593,19 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t s
|
|
|
602
593
|
const char* ptr = static_cast<const char*>(bytes);
|
|
603
594
|
const char* base = static_cast<const char*>(bytes);
|
|
604
595
|
uint8_t preamble_ints;
|
|
605
|
-
ptr += copy_from_mem(ptr,
|
|
596
|
+
ptr += copy_from_mem(ptr, preamble_ints);
|
|
606
597
|
uint8_t serial_version;
|
|
607
|
-
ptr += copy_from_mem(ptr,
|
|
598
|
+
ptr += copy_from_mem(ptr, serial_version);
|
|
608
599
|
uint8_t family_id;
|
|
609
|
-
ptr += copy_from_mem(ptr,
|
|
600
|
+
ptr += copy_from_mem(ptr, family_id);
|
|
610
601
|
uint8_t lg_k;
|
|
611
|
-
ptr += copy_from_mem(ptr,
|
|
602
|
+
ptr += copy_from_mem(ptr, lg_k);
|
|
612
603
|
uint8_t first_interesting_column;
|
|
613
|
-
ptr += copy_from_mem(ptr,
|
|
604
|
+
ptr += copy_from_mem(ptr, first_interesting_column);
|
|
614
605
|
uint8_t flags_byte;
|
|
615
|
-
ptr += copy_from_mem(ptr,
|
|
606
|
+
ptr += copy_from_mem(ptr, flags_byte);
|
|
616
607
|
uint16_t seed_hash;
|
|
617
|
-
ptr += copy_from_mem(ptr,
|
|
608
|
+
ptr += copy_from_mem(ptr, seed_hash);
|
|
618
609
|
const bool has_hip = flags_byte & (1 << flags::HAS_HIP);
|
|
619
610
|
const bool has_table = flags_byte & (1 << flags::HAS_TABLE);
|
|
620
611
|
const bool has_window = flags_byte & (1 << flags::HAS_WINDOW);
|
|
@@ -628,28 +619,28 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t s
|
|
|
628
619
|
double hip_est_accum = 0;
|
|
629
620
|
if (has_table || has_window) {
|
|
630
621
|
check_memory_size(ptr - base + sizeof(num_coupons), size);
|
|
631
|
-
ptr += copy_from_mem(ptr,
|
|
622
|
+
ptr += copy_from_mem(ptr, num_coupons);
|
|
632
623
|
if (has_table && has_window) {
|
|
633
624
|
check_memory_size(ptr - base + sizeof(compressed.table_num_entries), size);
|
|
634
|
-
ptr += copy_from_mem(ptr,
|
|
625
|
+
ptr += copy_from_mem(ptr, compressed.table_num_entries);
|
|
635
626
|
if (has_hip) {
|
|
636
627
|
check_memory_size(ptr - base + sizeof(kxp) + sizeof(hip_est_accum), size);
|
|
637
|
-
ptr += copy_from_mem(ptr,
|
|
638
|
-
ptr += copy_from_mem(ptr,
|
|
628
|
+
ptr += copy_from_mem(ptr, kxp);
|
|
629
|
+
ptr += copy_from_mem(ptr, hip_est_accum);
|
|
639
630
|
}
|
|
640
631
|
}
|
|
641
632
|
if (has_table) {
|
|
642
633
|
check_memory_size(ptr - base + sizeof(compressed.table_data_words), size);
|
|
643
|
-
ptr += copy_from_mem(ptr,
|
|
634
|
+
ptr += copy_from_mem(ptr, compressed.table_data_words);
|
|
644
635
|
}
|
|
645
636
|
if (has_window) {
|
|
646
637
|
check_memory_size(ptr - base + sizeof(compressed.window_data_words), size);
|
|
647
|
-
ptr += copy_from_mem(ptr,
|
|
638
|
+
ptr += copy_from_mem(ptr, compressed.window_data_words);
|
|
648
639
|
}
|
|
649
640
|
if (has_hip && !(has_table && has_window)) {
|
|
650
641
|
check_memory_size(ptr - base + sizeof(kxp) + sizeof(hip_est_accum), size);
|
|
651
|
-
ptr += copy_from_mem(ptr,
|
|
652
|
-
ptr += copy_from_mem(ptr,
|
|
642
|
+
ptr += copy_from_mem(ptr, kxp);
|
|
643
|
+
ptr += copy_from_mem(ptr, hip_est_accum);
|
|
653
644
|
}
|
|
654
645
|
if (has_window) {
|
|
655
646
|
compressed.window_data.resize(compressed.window_data_words);
|
|
@@ -688,6 +679,49 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t s
|
|
|
688
679
|
std::move(uncompressed.window), has_hip, kxp, hip_est_accum, seed);
|
|
689
680
|
}
|
|
690
681
|
|
|
682
|
+
/*
|
|
683
|
+
* These empirical values for the 99.9th percentile of size in bytes were measured using 100,000
|
|
684
|
+
* trials. The value for each trial is the maximum of 5*16=80 measurements that were equally
|
|
685
|
+
* spaced over values of the quantity C/K between 3.0 and 8.0. This table does not include the
|
|
686
|
+
* worst-case space for the preamble, which is added by the function.
|
|
687
|
+
*/
|
|
688
|
+
static const uint8_t CPC_EMPIRICAL_SIZE_MAX_LGK = 19;
|
|
689
|
+
static const size_t CPC_EMPIRICAL_MAX_SIZE_BYTES[] = {
|
|
690
|
+
24, // lg_k = 4
|
|
691
|
+
36, // lg_k = 5
|
|
692
|
+
56, // lg_k = 6
|
|
693
|
+
100, // lg_k = 7
|
|
694
|
+
180, // lg_k = 8
|
|
695
|
+
344, // lg_k = 9
|
|
696
|
+
660, // lg_k = 10
|
|
697
|
+
1292, // lg_k = 11
|
|
698
|
+
2540, // lg_k = 12
|
|
699
|
+
5020, // lg_k = 13
|
|
700
|
+
9968, // lg_k = 14
|
|
701
|
+
19836, // lg_k = 15
|
|
702
|
+
39532, // lg_k = 16
|
|
703
|
+
78880, // lg_k = 17
|
|
704
|
+
157516, // lg_k = 18
|
|
705
|
+
314656 // lg_k = 19
|
|
706
|
+
};
|
|
707
|
+
static const double CPC_EMPIRICAL_MAX_SIZE_FACTOR = 0.6; // 0.6 = 4.8 / 8.0
|
|
708
|
+
static const size_t CPC_MAX_PREAMBLE_SIZE_BYTES = 40;
|
|
709
|
+
|
|
710
|
+
template<typename A>
|
|
711
|
+
size_t cpc_sketch_alloc<A>::get_max_serialized_size_bytes(uint8_t lg_k) {
|
|
712
|
+
check_lg_k(lg_k);
|
|
713
|
+
if (lg_k <= CPC_EMPIRICAL_SIZE_MAX_LGK) return CPC_EMPIRICAL_MAX_SIZE_BYTES[lg_k - CPC_MIN_LG_K] + CPC_MAX_PREAMBLE_SIZE_BYTES;
|
|
714
|
+
const uint32_t k = 1 << lg_k;
|
|
715
|
+
return (int) (CPC_EMPIRICAL_MAX_SIZE_FACTOR * k) + CPC_MAX_PREAMBLE_SIZE_BYTES;
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
template<typename A>
|
|
719
|
+
void cpc_sketch_alloc<A>::check_lg_k(uint8_t lg_k) {
|
|
720
|
+
if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
|
|
721
|
+
throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
|
|
722
|
+
}
|
|
723
|
+
}
|
|
724
|
+
|
|
691
725
|
template<typename A>
|
|
692
726
|
uint32_t cpc_sketch_alloc<A>::get_num_coupons() const {
|
|
693
727
|
return num_coupons;
|
|
@@ -696,7 +730,7 @@ uint32_t cpc_sketch_alloc<A>::get_num_coupons() const {
|
|
|
696
730
|
template<typename A>
|
|
697
731
|
bool cpc_sketch_alloc<A>::validate() const {
|
|
698
732
|
vector_u64<A> bit_matrix = build_bit_matrix();
|
|
699
|
-
const uint64_t num_bits_set = count_bits_set_in_matrix(bit_matrix.data(),
|
|
733
|
+
const uint64_t num_bits_set = count_bits_set_in_matrix(bit_matrix.data(), 1ULL << lg_k);
|
|
700
734
|
return num_bits_set == num_coupons;
|
|
701
735
|
}
|
|
702
736
|
|
|
@@ -744,7 +778,7 @@ typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor() con
|
|
|
744
778
|
|
|
745
779
|
template<typename A>
|
|
746
780
|
typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor(uint8_t lg_k, uint64_t c) {
|
|
747
|
-
const
|
|
781
|
+
const uint32_t k = 1 << lg_k;
|
|
748
782
|
const uint64_t c2 = c << 1;
|
|
749
783
|
const uint64_t c8 = c << 3;
|
|
750
784
|
const uint64_t c32 = c << 5;
|
|
@@ -757,15 +791,15 @@ typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor(uint8
|
|
|
757
791
|
|
|
758
792
|
template<typename A>
|
|
759
793
|
uint8_t cpc_sketch_alloc<A>::determine_correct_offset(uint8_t lg_k, uint64_t c) {
|
|
760
|
-
const
|
|
794
|
+
const uint32_t k = 1 << lg_k;
|
|
761
795
|
const int64_t tmp = static_cast<int64_t>(c << 3) - static_cast<int64_t>(19 * k); // 8C - 19K
|
|
762
796
|
if (tmp < 0) return 0;
|
|
763
|
-
return tmp >> (lg_k + 3); // tmp / 8K
|
|
797
|
+
return static_cast<uint8_t>(tmp >> (lg_k + 3)); // tmp / 8K
|
|
764
798
|
}
|
|
765
799
|
|
|
766
800
|
template<typename A>
|
|
767
801
|
vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
|
|
768
|
-
const
|
|
802
|
+
const uint32_t k = 1 << lg_k;
|
|
769
803
|
if (window_offset > 56) throw std::logic_error("offset > 56");
|
|
770
804
|
|
|
771
805
|
// Fill the matrix with default rows in which the "early zone" is filled with ones.
|
|
@@ -782,12 +816,12 @@ vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
|
|
|
782
816
|
}
|
|
783
817
|
|
|
784
818
|
const uint32_t* slots = surprising_value_table.get_slots();
|
|
785
|
-
const
|
|
819
|
+
const uint32_t num_slots = 1 << surprising_value_table.get_lg_size();
|
|
786
820
|
for (size_t i = 0; i < num_slots; i++) {
|
|
787
821
|
const uint32_t row_col = slots[i];
|
|
788
822
|
if (row_col != UINT32_MAX) {
|
|
789
823
|
const uint8_t col = row_col & 63;
|
|
790
|
-
const
|
|
824
|
+
const uint32_t row = row_col >> 6;
|
|
791
825
|
// Flip the specified matrix bit from its default value.
|
|
792
826
|
// In the "early" zone the bit changes from 1 to 0.
|
|
793
827
|
// In the "late" zone the bit changes from 0 to 1.
|
|
@@ -799,8 +833,8 @@ vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
|
|
|
799
833
|
|
|
800
834
|
template<typename A>
|
|
801
835
|
void cpc_sketch_alloc<A>::write_hip(std::ostream& os) const {
|
|
802
|
-
|
|
803
|
-
|
|
836
|
+
write(os, kxp);
|
|
837
|
+
write(os, hip_est_accum);
|
|
804
838
|
}
|
|
805
839
|
|
|
806
840
|
template<typename A>
|