datasketches 0.2.0 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE +40 -3
- data/NOTICE +1 -1
- data/README.md +7 -7
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
- data/vendor/datasketches-cpp/LICENSE +40 -3
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/pyproject.toml +4 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
- data/vendor/datasketches-cpp/python/README.md +50 -50
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +10 -7
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
- metadata +18 -7
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
|
@@ -53,9 +53,7 @@ first_interesting_column(0),
|
|
|
53
53
|
kxp(1 << lg_k),
|
|
54
54
|
hip_est_accum(0)
|
|
55
55
|
{
|
|
56
|
-
|
|
57
|
-
throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
|
|
58
|
-
}
|
|
56
|
+
check_lg_k(lg_k);
|
|
59
57
|
}
|
|
60
58
|
|
|
61
59
|
template<typename A>
|
|
@@ -176,7 +174,7 @@ void cpc_sketch_alloc<A>::update(float value) {
|
|
|
176
174
|
|
|
177
175
|
static inline uint32_t row_col_from_two_hashes(uint64_t hash0, uint64_t hash1, uint8_t lg_k) {
|
|
178
176
|
if (lg_k > 26) throw std::logic_error("lg_k > 26");
|
|
179
|
-
const
|
|
177
|
+
const uint32_t k = 1 << lg_k;
|
|
180
178
|
uint8_t col = count_leading_zeros_in_u64(hash1); // 0 <= col <= 64
|
|
181
179
|
if (col > 63) col = 63; // clip so that 0 <= col <= 63
|
|
182
180
|
const uint32_t row = hash0 & (k - 1);
|
|
@@ -188,7 +186,7 @@ static inline uint32_t row_col_from_two_hashes(uint64_t hash0, uint64_t hash1, u
|
|
|
188
186
|
}
|
|
189
187
|
|
|
190
188
|
template<typename A>
|
|
191
|
-
void cpc_sketch_alloc<A>::update(const void* value,
|
|
189
|
+
void cpc_sketch_alloc<A>::update(const void* value, size_t size) {
|
|
192
190
|
HashState hashes;
|
|
193
191
|
MurmurHash3_x64_128(value, size, seed, hashes);
|
|
194
192
|
row_col_update(row_col_from_two_hashes(hashes.h1, hashes.h2, lg_k));
|
|
@@ -208,7 +206,7 @@ void cpc_sketch_alloc<A>::row_col_update(uint32_t row_col) {
|
|
|
208
206
|
|
|
209
207
|
template<typename A>
|
|
210
208
|
void cpc_sketch_alloc<A>::update_sparse(uint32_t row_col) {
|
|
211
|
-
const
|
|
209
|
+
const uint32_t k = 1 << lg_k;
|
|
212
210
|
const uint64_t c32pre = static_cast<uint64_t>(num_coupons) << 5;
|
|
213
211
|
if (c32pre >= 3 * k) throw std::logic_error("c32pre >= 3 * k"); // C < 3K/32, in other words flavor == SPARSE
|
|
214
212
|
bool is_novel = surprising_value_table.maybe_insert(row_col);
|
|
@@ -224,7 +222,7 @@ void cpc_sketch_alloc<A>::update_sparse(uint32_t row_col) {
|
|
|
224
222
|
template<typename A>
|
|
225
223
|
void cpc_sketch_alloc<A>::update_windowed(uint32_t row_col) {
|
|
226
224
|
if (window_offset > 56) throw std::logic_error("wrong window offset");
|
|
227
|
-
const
|
|
225
|
+
const uint32_t k = 1 << lg_k;
|
|
228
226
|
const uint64_t c32pre = static_cast<uint64_t>(num_coupons) << 5;
|
|
229
227
|
if (c32pre < 3 * k) throw std::logic_error("c32pre < 3 * k"); // C < 3K/32, in other words flavor >= HYBRID
|
|
230
228
|
const uint64_t c8pre = static_cast<uint64_t>(num_coupons) << 3;
|
|
@@ -266,7 +264,7 @@ void cpc_sketch_alloc<A>::update_windowed(uint32_t row_col) {
|
|
|
266
264
|
// Call this whenever a new coupon has been collected.
|
|
267
265
|
template<typename A>
|
|
268
266
|
void cpc_sketch_alloc<A>::update_hip(uint32_t row_col) {
|
|
269
|
-
const
|
|
267
|
+
const uint32_t k = 1 << lg_k;
|
|
270
268
|
const uint8_t col = row_col & 63;
|
|
271
269
|
const double one_over_p = static_cast<double>(k) / kxp;
|
|
272
270
|
hip_est_accum += one_over_p;
|
|
@@ -276,7 +274,7 @@ void cpc_sketch_alloc<A>::update_hip(uint32_t row_col) {
|
|
|
276
274
|
// In terms of flavor, this promotes SPARSE to HYBRID
|
|
277
275
|
template<typename A>
|
|
278
276
|
void cpc_sketch_alloc<A>::promote_sparse_to_windowed() {
|
|
279
|
-
const
|
|
277
|
+
const uint32_t k = 1 << lg_k;
|
|
280
278
|
const uint64_t c32 = static_cast<uint64_t>(num_coupons) << 5;
|
|
281
279
|
if (!(c32 == 3 * k || (lg_k == 4 && c32 > 3 * k))) throw std::logic_error("wrong c32");
|
|
282
280
|
|
|
@@ -285,16 +283,16 @@ void cpc_sketch_alloc<A>::promote_sparse_to_windowed() {
|
|
|
285
283
|
u32_table<A> new_table(2, 6 + lg_k, sliding_window.get_allocator());
|
|
286
284
|
|
|
287
285
|
const uint32_t* old_slots = surprising_value_table.get_slots();
|
|
288
|
-
const
|
|
286
|
+
const uint32_t old_num_slots = 1 << surprising_value_table.get_lg_size();
|
|
289
287
|
|
|
290
288
|
if (window_offset != 0) throw std::logic_error("window_offset != 0");
|
|
291
289
|
|
|
292
|
-
for (
|
|
290
|
+
for (uint32_t i = 0; i < old_num_slots; i++) {
|
|
293
291
|
const uint32_t row_col = old_slots[i];
|
|
294
292
|
if (row_col != UINT32_MAX) {
|
|
295
293
|
const uint8_t col = row_col & 63;
|
|
296
294
|
if (col < 8) {
|
|
297
|
-
const
|
|
295
|
+
const uint32_t row = row_col >> 6;
|
|
298
296
|
sliding_window[row] |= 1 << col;
|
|
299
297
|
} else {
|
|
300
298
|
// cannot use u32_table::must_insert(), because it doesn't provide for growth
|
|
@@ -314,7 +312,7 @@ void cpc_sketch_alloc<A>::move_window() {
|
|
|
314
312
|
if (new_offset != determine_correct_offset(lg_k, num_coupons)) throw std::logic_error("new_offset is wrong");
|
|
315
313
|
|
|
316
314
|
if (sliding_window.size() == 0) throw std::logic_error("no sliding window");
|
|
317
|
-
const
|
|
315
|
+
const uint32_t k = 1 << lg_k;
|
|
318
316
|
|
|
319
317
|
// Construct the full-sized bit matrix that corresponds to the sketch
|
|
320
318
|
vector_u64<A> bit_matrix = build_bit_matrix();
|
|
@@ -328,7 +326,7 @@ void cpc_sketch_alloc<A>::move_window() {
|
|
|
328
326
|
const uint64_t mask_for_flipping_early_zone = (static_cast<uint64_t>(1) << new_offset) - 1;
|
|
329
327
|
uint64_t all_surprises_ored = 0;
|
|
330
328
|
|
|
331
|
-
for (
|
|
329
|
+
for (uint32_t i = 0; i < k; i++) {
|
|
332
330
|
uint64_t pattern = bit_matrix[i];
|
|
333
331
|
sliding_window[i] = (pattern >> new_offset) & 0xff;
|
|
334
332
|
pattern &= mask_for_clearing_window;
|
|
@@ -357,7 +355,7 @@ void cpc_sketch_alloc<A>::move_window() {
|
|
|
357
355
|
// so that it will reflect changes that were previously outside the mantissa.
|
|
358
356
|
template<typename A>
|
|
359
357
|
void cpc_sketch_alloc<A>::refresh_kxp(const uint64_t* bit_matrix) {
|
|
360
|
-
const
|
|
358
|
+
const uint32_t k = 1 << lg_k;
|
|
361
359
|
|
|
362
360
|
// for improved numerical accuracy, we separately sum the bytes of the U64's
|
|
363
361
|
double byte_sums[8]; // allocating on the stack
|
|
@@ -383,7 +381,9 @@ void cpc_sketch_alloc<A>::refresh_kxp(const uint64_t* bit_matrix) {
|
|
|
383
381
|
|
|
384
382
|
template<typename A>
|
|
385
383
|
string<A> cpc_sketch_alloc<A>::to_string() const {
|
|
386
|
-
|
|
384
|
+
// Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
|
|
385
|
+
// The stream does not support passing an allocator instance, and alternatives are complicated.
|
|
386
|
+
std::ostringstream os;
|
|
387
387
|
os << "### CPC sketch summary:" << std::endl;
|
|
388
388
|
os << " lg_k : " << std::to_string(lg_k) << std::endl;
|
|
389
389
|
os << " seed hash : " << std::hex << compute_seed_hash(seed) << std::dec << std::endl;
|
|
@@ -394,14 +394,14 @@ string<A> cpc_sketch_alloc<A>::to_string() const {
|
|
|
394
394
|
os << " HIP estimate : " << hip_est_accum << std::endl;
|
|
395
395
|
os << " kxp : " << kxp << std::endl;
|
|
396
396
|
}
|
|
397
|
-
os << "
|
|
397
|
+
os << " interesting col: " << std::to_string(first_interesting_column) << std::endl;
|
|
398
398
|
os << " table entries : " << surprising_value_table.get_num_items() << std::endl;
|
|
399
399
|
os << " window : " << (sliding_window.size() == 0 ? "not " : "") << "allocated" << std::endl;
|
|
400
400
|
if (sliding_window.size() > 0) {
|
|
401
401
|
os << " window offset : " << std::to_string(window_offset) << std::endl;
|
|
402
402
|
}
|
|
403
403
|
os << "### End sketch summary" << std::endl;
|
|
404
|
-
return os.str();
|
|
404
|
+
return string<A>(os.str().c_str(), sliding_window.get_allocator());
|
|
405
405
|
}
|
|
406
406
|
|
|
407
407
|
template<typename A>
|
|
@@ -415,44 +415,44 @@ void cpc_sketch_alloc<A>::serialize(std::ostream& os) const {
|
|
|
415
415
|
const bool has_table = compressed.table_data.size() > 0;
|
|
416
416
|
const bool has_window = compressed.window_data.size() > 0;
|
|
417
417
|
const uint8_t preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window);
|
|
418
|
-
|
|
418
|
+
write(os, preamble_ints);
|
|
419
419
|
const uint8_t serial_version = SERIAL_VERSION;
|
|
420
|
-
|
|
420
|
+
write(os, serial_version);
|
|
421
421
|
const uint8_t family = FAMILY;
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
422
|
+
write(os, family);
|
|
423
|
+
write(os, lg_k);
|
|
424
|
+
write(os, first_interesting_column);
|
|
425
425
|
const uint8_t flags_byte(
|
|
426
426
|
(1 << flags::IS_COMPRESSED)
|
|
427
427
|
| (has_hip ? 1 << flags::HAS_HIP : 0)
|
|
428
428
|
| (has_table ? 1 << flags::HAS_TABLE : 0)
|
|
429
429
|
| (has_window ? 1 << flags::HAS_WINDOW : 0)
|
|
430
430
|
);
|
|
431
|
-
|
|
431
|
+
write(os, flags_byte);
|
|
432
432
|
const uint16_t seed_hash(compute_seed_hash(seed));
|
|
433
|
-
|
|
433
|
+
write(os, seed_hash);
|
|
434
434
|
if (!is_empty()) {
|
|
435
|
-
|
|
435
|
+
write(os, num_coupons);
|
|
436
436
|
if (has_table && has_window) {
|
|
437
437
|
// if there is no window it is the same as number of coupons
|
|
438
|
-
|
|
438
|
+
write(os, compressed.table_num_entries);
|
|
439
439
|
// HIP values can be in two different places in the sequence of fields
|
|
440
440
|
// this is the first HIP decision point
|
|
441
441
|
if (has_hip) write_hip(os);
|
|
442
442
|
}
|
|
443
443
|
if (has_table) {
|
|
444
|
-
|
|
444
|
+
write(os, compressed.table_data_words);
|
|
445
445
|
}
|
|
446
446
|
if (has_window) {
|
|
447
|
-
|
|
447
|
+
write(os, compressed.window_data_words);
|
|
448
448
|
}
|
|
449
449
|
// this is the second HIP decision point
|
|
450
450
|
if (has_hip && !(has_table && has_window)) write_hip(os);
|
|
451
451
|
if (has_window) {
|
|
452
|
-
|
|
452
|
+
write(os, compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
|
|
453
453
|
}
|
|
454
454
|
if (has_table) {
|
|
455
|
-
|
|
455
|
+
write(os, compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
|
|
456
456
|
}
|
|
457
457
|
}
|
|
458
458
|
}
|
|
@@ -471,36 +471,36 @@ vector_u8<A> cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
|
|
|
471
471
|
const size_t size = header_size_bytes + (preamble_ints + compressed.table_data_words + compressed.window_data_words) * sizeof(uint32_t);
|
|
472
472
|
vector_u8<A> bytes(size, 0, sliding_window.get_allocator());
|
|
473
473
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
474
|
-
ptr += copy_to_mem(
|
|
474
|
+
ptr += copy_to_mem(preamble_ints, ptr);
|
|
475
475
|
const uint8_t serial_version = SERIAL_VERSION;
|
|
476
|
-
ptr += copy_to_mem(
|
|
476
|
+
ptr += copy_to_mem(serial_version, ptr);
|
|
477
477
|
const uint8_t family = FAMILY;
|
|
478
|
-
ptr += copy_to_mem(
|
|
479
|
-
ptr += copy_to_mem(
|
|
480
|
-
ptr += copy_to_mem(
|
|
478
|
+
ptr += copy_to_mem(family, ptr);
|
|
479
|
+
ptr += copy_to_mem(lg_k, ptr);
|
|
480
|
+
ptr += copy_to_mem(first_interesting_column, ptr);
|
|
481
481
|
const uint8_t flags_byte(
|
|
482
482
|
(1 << flags::IS_COMPRESSED)
|
|
483
483
|
| (has_hip ? 1 << flags::HAS_HIP : 0)
|
|
484
484
|
| (has_table ? 1 << flags::HAS_TABLE : 0)
|
|
485
485
|
| (has_window ? 1 << flags::HAS_WINDOW : 0)
|
|
486
486
|
);
|
|
487
|
-
ptr += copy_to_mem(
|
|
487
|
+
ptr += copy_to_mem(flags_byte, ptr);
|
|
488
488
|
const uint16_t seed_hash = compute_seed_hash(seed);
|
|
489
|
-
ptr += copy_to_mem(
|
|
489
|
+
ptr += copy_to_mem(seed_hash, ptr);
|
|
490
490
|
if (!is_empty()) {
|
|
491
|
-
ptr += copy_to_mem(
|
|
491
|
+
ptr += copy_to_mem(num_coupons, ptr);
|
|
492
492
|
if (has_table && has_window) {
|
|
493
493
|
// if there is no window it is the same as number of coupons
|
|
494
|
-
ptr += copy_to_mem(
|
|
494
|
+
ptr += copy_to_mem(compressed.table_num_entries, ptr);
|
|
495
495
|
// HIP values can be in two different places in the sequence of fields
|
|
496
496
|
// this is the first HIP decision point
|
|
497
497
|
if (has_hip) ptr += copy_hip_to_mem(ptr);
|
|
498
498
|
}
|
|
499
499
|
if (has_table) {
|
|
500
|
-
ptr += copy_to_mem(
|
|
500
|
+
ptr += copy_to_mem(compressed.table_data_words, ptr);
|
|
501
501
|
}
|
|
502
502
|
if (has_window) {
|
|
503
|
-
ptr += copy_to_mem(
|
|
503
|
+
ptr += copy_to_mem(compressed.window_data_words, ptr);
|
|
504
504
|
}
|
|
505
505
|
// this is the second HIP decision point
|
|
506
506
|
if (has_hip && !(has_table && has_window)) ptr += copy_hip_to_mem(ptr);
|
|
@@ -517,20 +517,13 @@ vector_u8<A> cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
|
|
|
517
517
|
|
|
518
518
|
template<typename A>
|
|
519
519
|
cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
uint8_t
|
|
523
|
-
|
|
524
|
-
uint8_t
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
is.read((char*)&lg_k, sizeof(lg_k));
|
|
528
|
-
uint8_t first_interesting_column;
|
|
529
|
-
is.read((char*)&first_interesting_column, sizeof(first_interesting_column));
|
|
530
|
-
uint8_t flags_byte;
|
|
531
|
-
is.read((char*)&flags_byte, sizeof(flags_byte));
|
|
532
|
-
uint16_t seed_hash;
|
|
533
|
-
is.read((char*)&seed_hash, sizeof(seed_hash));
|
|
520
|
+
const auto preamble_ints = read<uint8_t>(is);
|
|
521
|
+
const auto serial_version = read<uint8_t>(is);
|
|
522
|
+
const auto family_id = read<uint8_t>(is);
|
|
523
|
+
const auto lg_k = read<uint8_t>(is);
|
|
524
|
+
const auto first_interesting_column = read<uint8_t>(is);
|
|
525
|
+
const auto flags_byte = read<uint8_t>(is);
|
|
526
|
+
const auto seed_hash = read<uint16_t>(is);
|
|
534
527
|
const bool has_hip = flags_byte & (1 << flags::HAS_HIP);
|
|
535
528
|
const bool has_table = flags_byte & (1 << flags::HAS_TABLE);
|
|
536
529
|
const bool has_window = flags_byte & (1 << flags::HAS_WINDOW);
|
|
@@ -542,31 +535,31 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(std::istream& is, uint64_t
|
|
|
542
535
|
double kxp = 0;
|
|
543
536
|
double hip_est_accum = 0;
|
|
544
537
|
if (has_table || has_window) {
|
|
545
|
-
|
|
538
|
+
num_coupons = read<uint32_t>(is);
|
|
546
539
|
if (has_table && has_window) {
|
|
547
|
-
|
|
540
|
+
compressed.table_num_entries = read<uint32_t>(is);
|
|
548
541
|
if (has_hip) {
|
|
549
|
-
|
|
550
|
-
|
|
542
|
+
kxp = read<double>(is);
|
|
543
|
+
hip_est_accum = read<double>(is);
|
|
551
544
|
}
|
|
552
545
|
}
|
|
553
546
|
if (has_table) {
|
|
554
|
-
|
|
547
|
+
compressed.table_data_words = read<uint32_t>(is);
|
|
555
548
|
}
|
|
556
549
|
if (has_window) {
|
|
557
|
-
|
|
550
|
+
compressed.window_data_words = read<uint32_t>(is);
|
|
558
551
|
}
|
|
559
552
|
if (has_hip && !(has_table && has_window)) {
|
|
560
|
-
|
|
561
|
-
|
|
553
|
+
kxp = read<double>(is);
|
|
554
|
+
hip_est_accum = read<double>(is);
|
|
562
555
|
}
|
|
563
556
|
if (has_window) {
|
|
564
557
|
compressed.window_data.resize(compressed.window_data_words);
|
|
565
|
-
|
|
558
|
+
read(is, compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
|
|
566
559
|
}
|
|
567
560
|
if (has_table) {
|
|
568
561
|
compressed.table_data.resize(compressed.table_data_words);
|
|
569
|
-
|
|
562
|
+
read(is, compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
|
|
570
563
|
}
|
|
571
564
|
if (!has_window) compressed.table_num_entries = num_coupons;
|
|
572
565
|
}
|
|
@@ -602,19 +595,19 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t s
|
|
|
602
595
|
const char* ptr = static_cast<const char*>(bytes);
|
|
603
596
|
const char* base = static_cast<const char*>(bytes);
|
|
604
597
|
uint8_t preamble_ints;
|
|
605
|
-
ptr += copy_from_mem(ptr,
|
|
598
|
+
ptr += copy_from_mem(ptr, preamble_ints);
|
|
606
599
|
uint8_t serial_version;
|
|
607
|
-
ptr += copy_from_mem(ptr,
|
|
600
|
+
ptr += copy_from_mem(ptr, serial_version);
|
|
608
601
|
uint8_t family_id;
|
|
609
|
-
ptr += copy_from_mem(ptr,
|
|
602
|
+
ptr += copy_from_mem(ptr, family_id);
|
|
610
603
|
uint8_t lg_k;
|
|
611
|
-
ptr += copy_from_mem(ptr,
|
|
604
|
+
ptr += copy_from_mem(ptr, lg_k);
|
|
612
605
|
uint8_t first_interesting_column;
|
|
613
|
-
ptr += copy_from_mem(ptr,
|
|
606
|
+
ptr += copy_from_mem(ptr, first_interesting_column);
|
|
614
607
|
uint8_t flags_byte;
|
|
615
|
-
ptr += copy_from_mem(ptr,
|
|
608
|
+
ptr += copy_from_mem(ptr, flags_byte);
|
|
616
609
|
uint16_t seed_hash;
|
|
617
|
-
ptr += copy_from_mem(ptr,
|
|
610
|
+
ptr += copy_from_mem(ptr, seed_hash);
|
|
618
611
|
const bool has_hip = flags_byte & (1 << flags::HAS_HIP);
|
|
619
612
|
const bool has_table = flags_byte & (1 << flags::HAS_TABLE);
|
|
620
613
|
const bool has_window = flags_byte & (1 << flags::HAS_WINDOW);
|
|
@@ -628,28 +621,28 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t s
|
|
|
628
621
|
double hip_est_accum = 0;
|
|
629
622
|
if (has_table || has_window) {
|
|
630
623
|
check_memory_size(ptr - base + sizeof(num_coupons), size);
|
|
631
|
-
ptr += copy_from_mem(ptr,
|
|
624
|
+
ptr += copy_from_mem(ptr, num_coupons);
|
|
632
625
|
if (has_table && has_window) {
|
|
633
626
|
check_memory_size(ptr - base + sizeof(compressed.table_num_entries), size);
|
|
634
|
-
ptr += copy_from_mem(ptr,
|
|
627
|
+
ptr += copy_from_mem(ptr, compressed.table_num_entries);
|
|
635
628
|
if (has_hip) {
|
|
636
629
|
check_memory_size(ptr - base + sizeof(kxp) + sizeof(hip_est_accum), size);
|
|
637
|
-
ptr += copy_from_mem(ptr,
|
|
638
|
-
ptr += copy_from_mem(ptr,
|
|
630
|
+
ptr += copy_from_mem(ptr, kxp);
|
|
631
|
+
ptr += copy_from_mem(ptr, hip_est_accum);
|
|
639
632
|
}
|
|
640
633
|
}
|
|
641
634
|
if (has_table) {
|
|
642
635
|
check_memory_size(ptr - base + sizeof(compressed.table_data_words), size);
|
|
643
|
-
ptr += copy_from_mem(ptr,
|
|
636
|
+
ptr += copy_from_mem(ptr, compressed.table_data_words);
|
|
644
637
|
}
|
|
645
638
|
if (has_window) {
|
|
646
639
|
check_memory_size(ptr - base + sizeof(compressed.window_data_words), size);
|
|
647
|
-
ptr += copy_from_mem(ptr,
|
|
640
|
+
ptr += copy_from_mem(ptr, compressed.window_data_words);
|
|
648
641
|
}
|
|
649
642
|
if (has_hip && !(has_table && has_window)) {
|
|
650
643
|
check_memory_size(ptr - base + sizeof(kxp) + sizeof(hip_est_accum), size);
|
|
651
|
-
ptr += copy_from_mem(ptr,
|
|
652
|
-
ptr += copy_from_mem(ptr,
|
|
644
|
+
ptr += copy_from_mem(ptr, kxp);
|
|
645
|
+
ptr += copy_from_mem(ptr, hip_est_accum);
|
|
653
646
|
}
|
|
654
647
|
if (has_window) {
|
|
655
648
|
compressed.window_data.resize(compressed.window_data_words);
|
|
@@ -688,6 +681,49 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t s
|
|
|
688
681
|
std::move(uncompressed.window), has_hip, kxp, hip_est_accum, seed);
|
|
689
682
|
}
|
|
690
683
|
|
|
684
|
+
/*
|
|
685
|
+
* These empirical values for the 99.9th percentile of size in bytes were measured using 100,000
|
|
686
|
+
* trials. The value for each trial is the maximum of 5*16=80 measurements that were equally
|
|
687
|
+
* spaced over values of the quantity C/K between 3.0 and 8.0. This table does not include the
|
|
688
|
+
* worst-case space for the preamble, which is added by the function.
|
|
689
|
+
*/
|
|
690
|
+
static const uint8_t CPC_EMPIRICAL_SIZE_MAX_LGK = 19;
|
|
691
|
+
static const size_t CPC_EMPIRICAL_MAX_SIZE_BYTES[] = {
|
|
692
|
+
24, // lg_k = 4
|
|
693
|
+
36, // lg_k = 5
|
|
694
|
+
56, // lg_k = 6
|
|
695
|
+
100, // lg_k = 7
|
|
696
|
+
180, // lg_k = 8
|
|
697
|
+
344, // lg_k = 9
|
|
698
|
+
660, // lg_k = 10
|
|
699
|
+
1292, // lg_k = 11
|
|
700
|
+
2540, // lg_k = 12
|
|
701
|
+
5020, // lg_k = 13
|
|
702
|
+
9968, // lg_k = 14
|
|
703
|
+
19836, // lg_k = 15
|
|
704
|
+
39532, // lg_k = 16
|
|
705
|
+
78880, // lg_k = 17
|
|
706
|
+
157516, // lg_k = 18
|
|
707
|
+
314656 // lg_k = 19
|
|
708
|
+
};
|
|
709
|
+
static const double CPC_EMPIRICAL_MAX_SIZE_FACTOR = 0.6; // 0.6 = 4.8 / 8.0
|
|
710
|
+
static const size_t CPC_MAX_PREAMBLE_SIZE_BYTES = 40;
|
|
711
|
+
|
|
712
|
+
template<typename A>
|
|
713
|
+
size_t cpc_sketch_alloc<A>::get_max_serialized_size_bytes(uint8_t lg_k) {
|
|
714
|
+
check_lg_k(lg_k);
|
|
715
|
+
if (lg_k <= CPC_EMPIRICAL_SIZE_MAX_LGK) return CPC_EMPIRICAL_MAX_SIZE_BYTES[lg_k - CPC_MIN_LG_K] + CPC_MAX_PREAMBLE_SIZE_BYTES;
|
|
716
|
+
const uint32_t k = 1 << lg_k;
|
|
717
|
+
return (int) (CPC_EMPIRICAL_MAX_SIZE_FACTOR * k) + CPC_MAX_PREAMBLE_SIZE_BYTES;
|
|
718
|
+
}
|
|
719
|
+
|
|
720
|
+
template<typename A>
|
|
721
|
+
void cpc_sketch_alloc<A>::check_lg_k(uint8_t lg_k) {
|
|
722
|
+
if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
|
|
723
|
+
throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
|
|
724
|
+
}
|
|
725
|
+
}
|
|
726
|
+
|
|
691
727
|
template<typename A>
|
|
692
728
|
uint32_t cpc_sketch_alloc<A>::get_num_coupons() const {
|
|
693
729
|
return num_coupons;
|
|
@@ -696,7 +732,7 @@ uint32_t cpc_sketch_alloc<A>::get_num_coupons() const {
|
|
|
696
732
|
template<typename A>
|
|
697
733
|
bool cpc_sketch_alloc<A>::validate() const {
|
|
698
734
|
vector_u64<A> bit_matrix = build_bit_matrix();
|
|
699
|
-
const uint64_t num_bits_set = count_bits_set_in_matrix(bit_matrix.data(),
|
|
735
|
+
const uint64_t num_bits_set = count_bits_set_in_matrix(bit_matrix.data(), 1ULL << lg_k);
|
|
700
736
|
return num_bits_set == num_coupons;
|
|
701
737
|
}
|
|
702
738
|
|
|
@@ -744,7 +780,7 @@ typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor() con
|
|
|
744
780
|
|
|
745
781
|
template<typename A>
|
|
746
782
|
typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor(uint8_t lg_k, uint64_t c) {
|
|
747
|
-
const
|
|
783
|
+
const uint32_t k = 1 << lg_k;
|
|
748
784
|
const uint64_t c2 = c << 1;
|
|
749
785
|
const uint64_t c8 = c << 3;
|
|
750
786
|
const uint64_t c32 = c << 5;
|
|
@@ -757,15 +793,15 @@ typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor(uint8
|
|
|
757
793
|
|
|
758
794
|
template<typename A>
|
|
759
795
|
uint8_t cpc_sketch_alloc<A>::determine_correct_offset(uint8_t lg_k, uint64_t c) {
|
|
760
|
-
const
|
|
796
|
+
const uint32_t k = 1 << lg_k;
|
|
761
797
|
const int64_t tmp = static_cast<int64_t>(c << 3) - static_cast<int64_t>(19 * k); // 8C - 19K
|
|
762
798
|
if (tmp < 0) return 0;
|
|
763
|
-
return tmp >> (lg_k + 3); // tmp / 8K
|
|
799
|
+
return static_cast<uint8_t>(tmp >> (lg_k + 3)); // tmp / 8K
|
|
764
800
|
}
|
|
765
801
|
|
|
766
802
|
template<typename A>
|
|
767
803
|
vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
|
|
768
|
-
const
|
|
804
|
+
const uint32_t k = 1 << lg_k;
|
|
769
805
|
if (window_offset > 56) throw std::logic_error("offset > 56");
|
|
770
806
|
|
|
771
807
|
// Fill the matrix with default rows in which the "early zone" is filled with ones.
|
|
@@ -782,12 +818,12 @@ vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
|
|
|
782
818
|
}
|
|
783
819
|
|
|
784
820
|
const uint32_t* slots = surprising_value_table.get_slots();
|
|
785
|
-
const
|
|
821
|
+
const uint32_t num_slots = 1 << surprising_value_table.get_lg_size();
|
|
786
822
|
for (size_t i = 0; i < num_slots; i++) {
|
|
787
823
|
const uint32_t row_col = slots[i];
|
|
788
824
|
if (row_col != UINT32_MAX) {
|
|
789
825
|
const uint8_t col = row_col & 63;
|
|
790
|
-
const
|
|
826
|
+
const uint32_t row = row_col >> 6;
|
|
791
827
|
// Flip the specified matrix bit from its default value.
|
|
792
828
|
// In the "early" zone the bit changes from 1 to 0.
|
|
793
829
|
// In the "late" zone the bit changes from 0 to 1.
|
|
@@ -799,8 +835,8 @@ vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
|
|
|
799
835
|
|
|
800
836
|
template<typename A>
|
|
801
837
|
void cpc_sketch_alloc<A>::write_hip(std::ostream& os) const {
|
|
802
|
-
|
|
803
|
-
|
|
838
|
+
write(os, kxp);
|
|
839
|
+
write(os, hip_est_accum);
|
|
804
840
|
}
|
|
805
841
|
|
|
806
842
|
template<typename A>
|
|
@@ -45,7 +45,7 @@ public:
|
|
|
45
45
|
* @param lg_k base 2 logarithm of the number of bins in the sketch
|
|
46
46
|
* @param seed for hash function
|
|
47
47
|
*/
|
|
48
|
-
explicit cpc_union_alloc(uint8_t lg_k =
|
|
48
|
+
explicit cpc_union_alloc(uint8_t lg_k = cpc_constants::DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
|
|
49
49
|
|
|
50
50
|
cpc_union_alloc(const cpc_union_alloc<A>& other);
|
|
51
51
|
cpc_union_alloc(cpc_union_alloc<A>&& other) noexcept;
|
|
@@ -34,7 +34,7 @@ bit_matrix(allocator)
|
|
|
34
34
|
if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
|
|
35
35
|
throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
|
|
36
36
|
}
|
|
37
|
-
accumulator = new (AllocCpc().allocate(1)) cpc_sketch_alloc<A>(lg_k, seed, allocator);
|
|
37
|
+
accumulator = new (AllocCpc(allocator).allocate(1)) cpc_sketch_alloc<A>(lg_k, seed, allocator);
|
|
38
38
|
}
|
|
39
39
|
|
|
40
40
|
template<typename A>
|
|
@@ -45,7 +45,7 @@ accumulator(other.accumulator),
|
|
|
45
45
|
bit_matrix(other.bit_matrix)
|
|
46
46
|
{
|
|
47
47
|
if (accumulator != nullptr) {
|
|
48
|
-
accumulator = new (AllocCpc().allocate(1)) cpc_sketch_alloc<A>(*other.accumulator);
|
|
48
|
+
accumulator = new (AllocCpc(accumulator->get_allocator()).allocate(1)) cpc_sketch_alloc<A>(*other.accumulator);
|
|
49
49
|
}
|
|
50
50
|
}
|
|
51
51
|
|
|
@@ -62,8 +62,9 @@ bit_matrix(std::move(other.bit_matrix))
|
|
|
62
62
|
template<typename A>
|
|
63
63
|
cpc_union_alloc<A>::~cpc_union_alloc() {
|
|
64
64
|
if (accumulator != nullptr) {
|
|
65
|
+
AllocCpc allocator(accumulator->get_allocator());
|
|
65
66
|
accumulator->~cpc_sketch_alloc<A>();
|
|
66
|
-
|
|
67
|
+
allocator.deallocate(accumulator, 1);
|
|
67
68
|
}
|
|
68
69
|
}
|
|
69
70
|
|
|
@@ -181,7 +182,7 @@ template<typename A>
|
|
|
181
182
|
cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_accumulator() const {
|
|
182
183
|
if (lg_k != accumulator->get_lg_k()) throw std::logic_error("lg_k != accumulator->lg_k");
|
|
183
184
|
if (accumulator->get_num_coupons() == 0) {
|
|
184
|
-
return cpc_sketch_alloc<A>(lg_k, seed);
|
|
185
|
+
return cpc_sketch_alloc<A>(lg_k, seed, accumulator->get_allocator());
|
|
185
186
|
}
|
|
186
187
|
if (accumulator->determine_flavor() != cpc_sketch_alloc<A>::flavor::SPARSE) throw std::logic_error("wrong flavor");
|
|
187
188
|
cpc_sketch_alloc<A> copy(*accumulator);
|
|
@@ -191,8 +192,8 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_accumulator() const {
|
|
|
191
192
|
|
|
192
193
|
template<typename A>
|
|
193
194
|
cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
|
|
194
|
-
const
|
|
195
|
-
const
|
|
195
|
+
const uint32_t k = 1 << lg_k;
|
|
196
|
+
const uint32_t num_coupons = count_bits_set_in_matrix(bit_matrix.data(), k);
|
|
196
197
|
|
|
197
198
|
const auto flavor = cpc_sketch_alloc<A>::determine_flavor(lg_k, num_coupons);
|
|
198
199
|
if (flavor != cpc_sketch_alloc<A>::flavor::HYBRID && flavor != cpc_sketch_alloc<A>::flavor::PINNED
|
|
@@ -215,7 +216,7 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
|
|
|
215
216
|
|
|
216
217
|
// The snowplow effect was caused by processing the rows in order,
|
|
217
218
|
// but we have fixed it by using a sufficiently large hash table.
|
|
218
|
-
for (
|
|
219
|
+
for (uint32_t i = 0; i < k; i++) {
|
|
219
220
|
uint64_t pattern = bit_matrix[i];
|
|
220
221
|
sliding_window[i] = (pattern >> offset) & 0xff;
|
|
221
222
|
pattern &= mask_for_clearing_window;
|
|
@@ -242,25 +243,26 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
|
|
|
242
243
|
template<typename A>
|
|
243
244
|
void cpc_union_alloc<A>::switch_to_bit_matrix() {
|
|
244
245
|
bit_matrix = accumulator->build_bit_matrix();
|
|
246
|
+
AllocCpc allocator(accumulator->get_allocator());
|
|
245
247
|
accumulator->~cpc_sketch_alloc<A>();
|
|
246
|
-
|
|
248
|
+
allocator.deallocate(accumulator, 1);
|
|
247
249
|
accumulator = nullptr;
|
|
248
250
|
}
|
|
249
251
|
|
|
250
252
|
template<typename A>
|
|
251
253
|
void cpc_union_alloc<A>::walk_table_updating_sketch(const u32_table<A>& table) {
|
|
252
254
|
const uint32_t* slots = table.get_slots();
|
|
253
|
-
const
|
|
255
|
+
const uint32_t num_slots = 1 << table.get_lg_size();
|
|
254
256
|
const uint64_t dst_mask = (((1 << accumulator->get_lg_k()) - 1) << 6) | 63; // downsamples when dst lgK < src LgK
|
|
255
257
|
|
|
256
258
|
// Using a golden ratio stride fixes the snowplow effect.
|
|
257
259
|
const double golden = 0.6180339887498949025;
|
|
258
|
-
|
|
260
|
+
uint32_t stride = static_cast<uint32_t>(golden * static_cast<double>(num_slots));
|
|
259
261
|
if (stride < 2) throw std::logic_error("stride < 2");
|
|
260
262
|
if (stride == ((stride >> 1) << 1)) stride += 1; // force the stride to be odd
|
|
261
263
|
if (stride < 3 || stride >= num_slots) throw std::out_of_range("stride out of range");
|
|
262
264
|
|
|
263
|
-
for (
|
|
265
|
+
for (uint32_t i = 0, j = 0; i < num_slots; i++, j += stride) {
|
|
264
266
|
j &= num_slots - 1;
|
|
265
267
|
const uint32_t row_col = slots[j];
|
|
266
268
|
if (row_col != UINT32_MAX) {
|
|
@@ -272,13 +274,13 @@ void cpc_union_alloc<A>::walk_table_updating_sketch(const u32_table<A>& table) {
|
|
|
272
274
|
template<typename A>
|
|
273
275
|
void cpc_union_alloc<A>::or_table_into_matrix(const u32_table<A>& table) {
|
|
274
276
|
const uint32_t* slots = table.get_slots();
|
|
275
|
-
const
|
|
277
|
+
const uint32_t num_slots = 1 << table.get_lg_size();
|
|
276
278
|
const uint64_t dest_mask = (1 << lg_k) - 1; // downsamples when dst lgK < sr LgK
|
|
277
|
-
for (
|
|
279
|
+
for (uint32_t i = 0; i < num_slots; i++) {
|
|
278
280
|
const uint32_t row_col = slots[i];
|
|
279
281
|
if (row_col != UINT32_MAX) {
|
|
280
282
|
const uint8_t col = row_col & 63;
|
|
281
|
-
const
|
|
283
|
+
const uint32_t row = row_col >> 6;
|
|
282
284
|
bit_matrix[row & dest_mask] |= static_cast<uint64_t>(1) << col; // set the bit
|
|
283
285
|
}
|
|
284
286
|
}
|
|
@@ -288,8 +290,8 @@ template<typename A>
|
|
|
288
290
|
void cpc_union_alloc<A>::or_window_into_matrix(const vector_u8<A>& sliding_window, uint8_t offset, uint8_t src_lg_k) {
|
|
289
291
|
if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK");
|
|
290
292
|
const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK
|
|
291
|
-
const
|
|
292
|
-
for (
|
|
293
|
+
const uint32_t src_k = 1 << src_lg_k;
|
|
294
|
+
for (uint32_t src_row = 0; src_row < src_k; src_row++) {
|
|
293
295
|
bit_matrix[src_row & dst_mask] |= static_cast<uint64_t>(sliding_window[src_row]) << offset;
|
|
294
296
|
}
|
|
295
297
|
}
|
|
@@ -298,8 +300,8 @@ template<typename A>
|
|
|
298
300
|
void cpc_union_alloc<A>::or_matrix_into_matrix(const vector_u64<A>& src_matrix, uint8_t src_lg_k) {
|
|
299
301
|
if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK");
|
|
300
302
|
const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK
|
|
301
|
-
const
|
|
302
|
-
for (
|
|
303
|
+
const uint32_t src_k = 1 << src_lg_k;
|
|
304
|
+
for (uint32_t src_row = 0; src_row < src_k; src_row++) {
|
|
303
305
|
bit_matrix[src_row & dst_mask] |= src_matrix[src_row];
|
|
304
306
|
}
|
|
305
307
|
}
|
|
@@ -313,7 +315,7 @@ void cpc_union_alloc<A>::reduce_k(uint8_t new_lg_k) {
|
|
|
313
315
|
if (accumulator != nullptr) throw std::logic_error("accumulator is not null");
|
|
314
316
|
vector_u64<A> old_matrix = std::move(bit_matrix);
|
|
315
317
|
const uint8_t old_lg_k = lg_k;
|
|
316
|
-
const
|
|
318
|
+
const uint32_t new_k = 1 << new_lg_k;
|
|
317
319
|
bit_matrix = vector_u64<A>(new_k, 0, old_matrix.get_allocator());
|
|
318
320
|
lg_k = new_lg_k;
|
|
319
321
|
or_matrix_into_matrix(old_matrix, old_lg_k);
|
|
@@ -324,7 +326,7 @@ void cpc_union_alloc<A>::reduce_k(uint8_t new_lg_k) {
|
|
|
324
326
|
if (bit_matrix.size() > 0) throw std::logic_error("bit_matrix is not expected");
|
|
325
327
|
if (!accumulator->is_empty()) {
|
|
326
328
|
cpc_sketch_alloc<A> old_accumulator(*accumulator);
|
|
327
|
-
*accumulator = cpc_sketch_alloc<A>(new_lg_k, seed);
|
|
329
|
+
*accumulator = cpc_sketch_alloc<A>(new_lg_k, seed, old_accumulator.get_allocator());
|
|
328
330
|
walk_table_updating_sketch(old_accumulator.surprising_value_table);
|
|
329
331
|
}
|
|
330
332
|
lg_k = new_lg_k;
|