datasketches 0.2.0 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE +40 -3
- data/NOTICE +1 -1
- data/README.md +7 -7
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
- data/vendor/datasketches-cpp/LICENSE +40 -3
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/pyproject.toml +4 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
- data/vendor/datasketches-cpp/python/README.md +50 -50
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +10 -7
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
- metadata +18 -7
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
|
@@ -303,7 +303,7 @@ std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(const double* fractions,
|
|
|
303
303
|
}
|
|
304
304
|
|
|
305
305
|
template<typename T, typename C, typename S, typename A>
|
|
306
|
-
std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(
|
|
306
|
+
std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(uint32_t num) const {
|
|
307
307
|
if (is_empty()) return std::vector<T, A>(allocator_);
|
|
308
308
|
if (num == 0) {
|
|
309
309
|
throw std::invalid_argument("num must be > 0");
|
|
@@ -380,36 +380,56 @@ size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
|
|
|
380
380
|
size_t size = DATA_START + num_levels_ * sizeof(uint32_t);
|
|
381
381
|
size += S().size_of_item(*min_value_);
|
|
382
382
|
size += S().size_of_item(*max_value_);
|
|
383
|
-
for (auto
|
|
383
|
+
for (auto it: *this) size += S().size_of_item(it.first);
|
|
384
384
|
return size;
|
|
385
385
|
}
|
|
386
386
|
|
|
387
|
+
// implementation for fixed-size arithmetic types (integral and floating point)
|
|
388
|
+
template<typename T, typename C, typename S, typename A>
|
|
389
|
+
template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
|
|
390
|
+
size_t kll_sketch<T, C, S, A>::get_max_serialized_size_bytes(uint16_t k, uint64_t n) {
|
|
391
|
+
const uint8_t num_levels = kll_helper::ub_on_num_levels(n);
|
|
392
|
+
const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, DEFAULT_M, num_levels);
|
|
393
|
+
// the last integer in the levels_ array is not serialized because it can be derived
|
|
394
|
+
return DATA_START + num_levels * sizeof(uint32_t) + (max_num_retained + 2) * sizeof(TT);
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
// implementation for all other types
|
|
398
|
+
template<typename T, typename C, typename S, typename A>
|
|
399
|
+
template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
|
|
400
|
+
size_t kll_sketch<T, C, S, A>::get_max_serialized_size_bytes(uint16_t k, uint64_t n, size_t max_item_size_bytes) {
|
|
401
|
+
const uint8_t num_levels = kll_helper::ub_on_num_levels(n);
|
|
402
|
+
const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, DEFAULT_M, num_levels);
|
|
403
|
+
// the last integer in the levels_ array is not serialized because it can be derived
|
|
404
|
+
return DATA_START + num_levels * sizeof(uint32_t) + (max_num_retained + 2) * max_item_size_bytes;
|
|
405
|
+
}
|
|
406
|
+
|
|
387
407
|
template<typename T, typename C, typename S, typename A>
|
|
388
408
|
void kll_sketch<T, C, S, A>::serialize(std::ostream& os) const {
|
|
389
409
|
const bool is_single_item = n_ == 1;
|
|
390
410
|
const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
|
|
391
|
-
|
|
411
|
+
write(os, preamble_ints);
|
|
392
412
|
const uint8_t serial_version(is_single_item ? SERIAL_VERSION_2 : SERIAL_VERSION_1);
|
|
393
|
-
|
|
413
|
+
write(os, serial_version);
|
|
394
414
|
const uint8_t family(FAMILY);
|
|
395
|
-
|
|
415
|
+
write(os, family);
|
|
396
416
|
const uint8_t flags_byte(
|
|
397
417
|
(is_empty() ? 1 << flags::IS_EMPTY : 0)
|
|
398
418
|
| (is_level_zero_sorted_ ? 1 << flags::IS_LEVEL_ZERO_SORTED : 0)
|
|
399
419
|
| (is_single_item ? 1 << flags::IS_SINGLE_ITEM : 0)
|
|
400
420
|
);
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
421
|
+
write(os, flags_byte);
|
|
422
|
+
write(os, k_);
|
|
423
|
+
write(os, m_);
|
|
404
424
|
const uint8_t unused = 0;
|
|
405
|
-
|
|
425
|
+
write(os, unused);
|
|
406
426
|
if (is_empty()) return;
|
|
407
427
|
if (!is_single_item) {
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
428
|
+
write(os, n_);
|
|
429
|
+
write(os, min_k_);
|
|
430
|
+
write(os, num_levels_);
|
|
431
|
+
write(os, unused);
|
|
432
|
+
write(os, levels_.data(), sizeof(levels_[0]) * num_levels_);
|
|
413
433
|
S().serialize(os, min_value_, 1);
|
|
414
434
|
S().serialize(os, max_value_, 1);
|
|
415
435
|
}
|
|
@@ -424,27 +444,26 @@ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const
|
|
|
424
444
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
425
445
|
const uint8_t* end_ptr = ptr + size;
|
|
426
446
|
const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
|
|
427
|
-
ptr += copy_to_mem(
|
|
447
|
+
ptr += copy_to_mem(preamble_ints, ptr);
|
|
428
448
|
const uint8_t serial_version(is_single_item ? SERIAL_VERSION_2 : SERIAL_VERSION_1);
|
|
429
|
-
ptr += copy_to_mem(
|
|
449
|
+
ptr += copy_to_mem(serial_version, ptr);
|
|
430
450
|
const uint8_t family(FAMILY);
|
|
431
|
-
ptr += copy_to_mem(
|
|
451
|
+
ptr += copy_to_mem(family, ptr);
|
|
432
452
|
const uint8_t flags_byte(
|
|
433
453
|
(is_empty() ? 1 << flags::IS_EMPTY : 0)
|
|
434
454
|
| (is_level_zero_sorted_ ? 1 << flags::IS_LEVEL_ZERO_SORTED : 0)
|
|
435
455
|
| (is_single_item ? 1 << flags::IS_SINGLE_ITEM : 0)
|
|
436
456
|
);
|
|
437
|
-
ptr += copy_to_mem(
|
|
438
|
-
ptr += copy_to_mem(
|
|
439
|
-
ptr += copy_to_mem(
|
|
440
|
-
|
|
441
|
-
ptr += copy_to_mem(&unused, ptr, sizeof(unused));
|
|
457
|
+
ptr += copy_to_mem(flags_byte, ptr);
|
|
458
|
+
ptr += copy_to_mem(k_, ptr);
|
|
459
|
+
ptr += copy_to_mem(m_, ptr);
|
|
460
|
+
ptr += sizeof(uint8_t); // unused
|
|
442
461
|
if (!is_empty()) {
|
|
443
462
|
if (!is_single_item) {
|
|
444
|
-
ptr += copy_to_mem(
|
|
445
|
-
ptr += copy_to_mem(
|
|
446
|
-
ptr += copy_to_mem(
|
|
447
|
-
ptr +=
|
|
463
|
+
ptr += copy_to_mem(n_, ptr);
|
|
464
|
+
ptr += copy_to_mem(min_k_, ptr);
|
|
465
|
+
ptr += copy_to_mem(num_levels_, ptr);
|
|
466
|
+
ptr += sizeof(uint8_t); // unused
|
|
448
467
|
ptr += copy_to_mem(levels_.data(), ptr, sizeof(levels_[0]) * num_levels_);
|
|
449
468
|
ptr += S().serialize(ptr, end_ptr - ptr, min_value_, 1);
|
|
450
469
|
ptr += S().serialize(ptr, end_ptr - ptr, max_value_, 1);
|
|
@@ -459,20 +478,13 @@ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const
|
|
|
459
478
|
|
|
460
479
|
template<typename T, typename C, typename S, typename A>
|
|
461
480
|
kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, const A& allocator) {
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
uint8_t
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
uint8_t
|
|
469
|
-
is.read((char*)&flags_byte, sizeof(flags_byte));
|
|
470
|
-
uint16_t k;
|
|
471
|
-
is.read((char*)&k, sizeof(k));
|
|
472
|
-
uint8_t m;
|
|
473
|
-
is.read((char*)&m, sizeof(m));
|
|
474
|
-
uint8_t unused;
|
|
475
|
-
is.read((char*)&unused, sizeof(unused));
|
|
481
|
+
const auto preamble_ints = read<uint8_t>(is);
|
|
482
|
+
const auto serial_version = read<uint8_t>(is);
|
|
483
|
+
const auto family_id = read<uint8_t>(is);
|
|
484
|
+
const auto flags_byte = read<uint8_t>(is);
|
|
485
|
+
const auto k = read<uint16_t>(is);
|
|
486
|
+
const auto m = read<uint8_t>(is);
|
|
487
|
+
read<uint8_t>(is); // skip unused byte
|
|
476
488
|
|
|
477
489
|
check_m(m);
|
|
478
490
|
check_preamble_ints(preamble_ints, flags_byte);
|
|
@@ -492,10 +504,10 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, con
|
|
|
492
504
|
min_k = k;
|
|
493
505
|
num_levels = 1;
|
|
494
506
|
} else {
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
507
|
+
n = read<uint64_t>(is);
|
|
508
|
+
min_k = read<uint16_t>(is);
|
|
509
|
+
num_levels = read<uint8_t>(is);
|
|
510
|
+
read<uint8_t>(is); // skip unused byte
|
|
499
511
|
}
|
|
500
512
|
vector_u32<A> levels(num_levels + 1, 0, allocator);
|
|
501
513
|
const uint32_t capacity(kll_helper::compute_total_capacity(k, m, num_levels));
|
|
@@ -503,7 +515,7 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, con
|
|
|
503
515
|
levels[0] = capacity - 1;
|
|
504
516
|
} else {
|
|
505
517
|
// the last integer in levels_ is not serialized because it can be derived
|
|
506
|
-
|
|
518
|
+
read(is, levels.data(), sizeof(levels[0]) * num_levels);
|
|
507
519
|
}
|
|
508
520
|
levels[num_levels] = capacity;
|
|
509
521
|
A alloc(allocator);
|
|
@@ -546,24 +558,24 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, si
|
|
|
546
558
|
ensure_minimum_memory(size, 8);
|
|
547
559
|
const char* ptr = static_cast<const char*>(bytes);
|
|
548
560
|
uint8_t preamble_ints;
|
|
549
|
-
ptr += copy_from_mem(ptr,
|
|
561
|
+
ptr += copy_from_mem(ptr, preamble_ints);
|
|
550
562
|
uint8_t serial_version;
|
|
551
|
-
ptr += copy_from_mem(ptr,
|
|
563
|
+
ptr += copy_from_mem(ptr, serial_version);
|
|
552
564
|
uint8_t family_id;
|
|
553
|
-
ptr += copy_from_mem(ptr,
|
|
565
|
+
ptr += copy_from_mem(ptr, family_id);
|
|
554
566
|
uint8_t flags_byte;
|
|
555
|
-
ptr += copy_from_mem(ptr,
|
|
567
|
+
ptr += copy_from_mem(ptr, flags_byte);
|
|
556
568
|
uint16_t k;
|
|
557
|
-
ptr += copy_from_mem(ptr,
|
|
569
|
+
ptr += copy_from_mem(ptr, k);
|
|
558
570
|
uint8_t m;
|
|
559
|
-
ptr += copy_from_mem(ptr,
|
|
560
|
-
ptr
|
|
571
|
+
ptr += copy_from_mem(ptr, m);
|
|
572
|
+
ptr += sizeof(uint8_t); // skip unused byte
|
|
561
573
|
|
|
562
574
|
check_m(m);
|
|
563
575
|
check_preamble_ints(preamble_ints, flags_byte);
|
|
564
576
|
check_serial_version(serial_version);
|
|
565
577
|
check_family_id(family_id);
|
|
566
|
-
ensure_minimum_memory(size,
|
|
578
|
+
ensure_minimum_memory(size, preamble_ints * sizeof(uint32_t));
|
|
567
579
|
|
|
568
580
|
const bool is_empty(flags_byte & (1 << flags::IS_EMPTY));
|
|
569
581
|
if (is_empty) return kll_sketch<T, C, S, A>(k, allocator);
|
|
@@ -578,10 +590,10 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, si
|
|
|
578
590
|
min_k = k;
|
|
579
591
|
num_levels = 1;
|
|
580
592
|
} else {
|
|
581
|
-
ptr += copy_from_mem(ptr,
|
|
582
|
-
ptr += copy_from_mem(ptr,
|
|
583
|
-
ptr += copy_from_mem(ptr,
|
|
584
|
-
ptr
|
|
593
|
+
ptr += copy_from_mem(ptr, n);
|
|
594
|
+
ptr += copy_from_mem(ptr, min_k);
|
|
595
|
+
ptr += copy_from_mem(ptr, num_levels);
|
|
596
|
+
ptr += sizeof(uint8_t); // skip unused byte
|
|
585
597
|
}
|
|
586
598
|
vector_u32<A> levels(num_levels + 1, 0, allocator);
|
|
587
599
|
const uint32_t capacity(kll_helper::compute_total_capacity(k, m, num_levels));
|
|
@@ -779,7 +791,7 @@ std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantil
|
|
|
779
791
|
using AllocCalc = typename std::allocator_traits<A>::template rebind_alloc<kll_quantile_calculator<T, C, A>>;
|
|
780
792
|
AllocCalc alloc(allocator_);
|
|
781
793
|
std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> quantile_calculator(
|
|
782
|
-
new (alloc.allocate(1)) kll_quantile_calculator<T, C, A>(
|
|
794
|
+
new (alloc.allocate(1)) kll_quantile_calculator<T, C, A>(*this),
|
|
783
795
|
[&alloc](kll_quantile_calculator<T, C, A>* ptr){ ptr->~kll_quantile_calculator<T, C, A>(); alloc.deallocate(ptr, 1); }
|
|
784
796
|
);
|
|
785
797
|
return quantile_calculator;
|
|
@@ -1011,7 +1023,9 @@ void kll_sketch<T, C, S, A>::check_family_id(uint8_t family_id) {
|
|
|
1011
1023
|
|
|
1012
1024
|
template <typename T, typename C, typename S, typename A>
|
|
1013
1025
|
string<A> kll_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items) const {
|
|
1014
|
-
|
|
1026
|
+
// Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
|
|
1027
|
+
// The stream does not support passing an allocator instance, and alternatives are complicated.
|
|
1028
|
+
std::ostringstream os;
|
|
1015
1029
|
os << "### KLL sketch summary:" << std::endl;
|
|
1016
1030
|
os << " K : " << k_ << std::endl;
|
|
1017
1031
|
os << " min K : " << min_k_ << std::endl;
|
|
@@ -1057,7 +1071,7 @@ string<A> kll_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items)
|
|
|
1057
1071
|
}
|
|
1058
1072
|
os << "### End sketch data" << std::endl;
|
|
1059
1073
|
}
|
|
1060
|
-
return os.str();
|
|
1074
|
+
return string<A>(os.str().c_str(), allocator_);
|
|
1061
1075
|
}
|
|
1062
1076
|
|
|
1063
1077
|
template <typename T, typename C, typename S, typename A>
|
|
@@ -1067,14 +1081,14 @@ typename kll_sketch<T, C, S, A>::const_iterator kll_sketch<T, C, S, A>::begin()
|
|
|
1067
1081
|
|
|
1068
1082
|
template <typename T, typename C, typename S, typename A>
|
|
1069
1083
|
typename kll_sketch<T, C, S, A>::const_iterator kll_sketch<T, C, S, A>::end() const {
|
|
1070
|
-
return kll_sketch<T, C, S, A>::const_iterator(nullptr,
|
|
1084
|
+
return kll_sketch<T, C, S, A>::const_iterator(nullptr, levels_.data(), num_levels_);
|
|
1071
1085
|
}
|
|
1072
1086
|
|
|
1073
1087
|
// kll_sketch::const_iterator implementation
|
|
1074
1088
|
|
|
1075
1089
|
template<typename T, typename C, typename S, typename A>
|
|
1076
1090
|
kll_sketch<T, C, S, A>::const_iterator::const_iterator(const T* items, const uint32_t* levels, const uint8_t num_levels):
|
|
1077
|
-
items(items), levels(levels), num_levels(num_levels), index(
|
|
1091
|
+
items(items), levels(levels), num_levels(num_levels), index(items == nullptr ? levels[num_levels] : levels[0]), level(items == nullptr ? num_levels : 0), weight(1)
|
|
1078
1092
|
{}
|
|
1079
1093
|
|
|
1080
1094
|
template<typename T, typename C, typename S, typename A>
|
|
@@ -1098,8 +1112,6 @@ typename kll_sketch<T, C, S, A>::const_iterator& kll_sketch<T, C, S, A>::const_i
|
|
|
1098
1112
|
|
|
1099
1113
|
template<typename T, typename C, typename S, typename A>
|
|
1100
1114
|
bool kll_sketch<T, C, S, A>::const_iterator::operator==(const const_iterator& other) const {
|
|
1101
|
-
if (level != other.level) return false;
|
|
1102
|
-
if (level == num_levels) return true; // end
|
|
1103
1115
|
return index == other.index;
|
|
1104
1116
|
}
|
|
1105
1117
|
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef KOLMOGOROV_SMIRNOV_HPP_
|
|
21
|
+
#define KOLMOGOROV_SMIRNOV_HPP_
|
|
22
|
+
|
|
23
|
+
namespace datasketches {
|
|
24
|
+
|
|
25
|
+
class kolmogorov_smirnov {
|
|
26
|
+
public:
|
|
27
|
+
/**
|
|
28
|
+
* Computes the raw delta area between two KLL quantile sketches for the Kolmogorov-Smirnov Test.
|
|
29
|
+
* @param sketch1 KLL sketch 1
|
|
30
|
+
* @param sketch2 KLL sketch 2
|
|
31
|
+
* @return the raw delta between two KLL quantile sketches
|
|
32
|
+
*/
|
|
33
|
+
template<typename Sketch>
|
|
34
|
+
static double delta(const Sketch& sketch1, const Sketch& sketch2);
|
|
35
|
+
|
|
36
|
+
/**
|
|
37
|
+
* Computes the adjusted delta area threshold for the Kolmogorov-Smirnov Test.
|
|
38
|
+
* Adjusts the computed threshold by the error epsilons of the two given sketches.
|
|
39
|
+
* See <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov–Smirnov Test</a>
|
|
40
|
+
* @param sketch1 KLL sketch 1
|
|
41
|
+
* @param sketch2 KLL sketch 2
|
|
42
|
+
* @param p Target p-value. Typically .001 to .1, e.g., .05.
|
|
43
|
+
* @return the adjusted threshold to be compared with the raw delta
|
|
44
|
+
*/
|
|
45
|
+
template<typename Sketch>
|
|
46
|
+
static double threshold(const Sketch& sketch1, const Sketch& sketch2, double p);
|
|
47
|
+
|
|
48
|
+
/**
|
|
49
|
+
* Performs the Kolmogorov-Smirnov Test between two KLL quantiles sketches.
|
|
50
|
+
* Note: if the given sketches have insufficient data or if the sketch sizes are too small,
|
|
51
|
+
* this will return false.
|
|
52
|
+
* @param sketch1 KLL sketch 1
|
|
53
|
+
* @param sketch2 KLL sketch 2
|
|
54
|
+
* @param p Target p-value. Typically .001 to .1, e.g., .05.
|
|
55
|
+
* @return Boolean indicating whether we can reject the null hypothesis (that the sketches
|
|
56
|
+
* reflect the same underlying distribution) using the provided p-value.
|
|
57
|
+
*/
|
|
58
|
+
template<typename Sketch>
|
|
59
|
+
static bool test(const Sketch& sketch1, const Sketch& sketch2, double p);
|
|
60
|
+
|
|
61
|
+
};
|
|
62
|
+
|
|
63
|
+
} /* namespace datasketches */
|
|
64
|
+
|
|
65
|
+
#include "kolmogorov_smirnov_impl.hpp"
|
|
66
|
+
|
|
67
|
+
#endif
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef KOLMOGOROV_SMIRNOV_IMPL_HPP_
|
|
21
|
+
#define KOLMOGOROV_SMIRNOV_IMPL_HPP_
|
|
22
|
+
|
|
23
|
+
namespace datasketches {
|
|
24
|
+
|
|
25
|
+
// type resolver
|
|
26
|
+
template<typename T, typename C, typename S, typename A>
|
|
27
|
+
kll_quantile_calculator<T, C, A> make_quantile_calculator(const kll_sketch<T, C, S, A>& sketch) {
|
|
28
|
+
return kll_quantile_calculator<T, C, A>(sketch);
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
template<typename Sketch>
|
|
32
|
+
double kolmogorov_smirnov::delta(const Sketch& sketch1, const Sketch& sketch2) {
|
|
33
|
+
using Comparator = typename Sketch::comparator;
|
|
34
|
+
auto calc1 = make_quantile_calculator(sketch1);
|
|
35
|
+
auto calc2 = make_quantile_calculator(sketch2);
|
|
36
|
+
auto it1 = calc1.begin();
|
|
37
|
+
auto it2 = calc2.begin();
|
|
38
|
+
const auto n1 = sketch1.get_n();
|
|
39
|
+
const auto n2 = sketch2.get_n();
|
|
40
|
+
double delta = 0;
|
|
41
|
+
while (it1 != calc1.end() && it2 != calc2.end()) {
|
|
42
|
+
const double norm_cum_wt1 = static_cast<double>((*it1).second) / n1;
|
|
43
|
+
const double norm_cum_wt2 = static_cast<double>((*it2).second) / n2;
|
|
44
|
+
delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
|
|
45
|
+
if (Comparator()((*it1).first, (*it2).first)) {
|
|
46
|
+
++it1;
|
|
47
|
+
} else if (Comparator()((*it2).first, (*it1).first)) {
|
|
48
|
+
++it2;
|
|
49
|
+
} else {
|
|
50
|
+
++it1;
|
|
51
|
+
++it2;
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
const double norm_cum_wt1 = it1 == calc1.end() ? 1 : static_cast<double>((*it1).second) / n1;
|
|
55
|
+
const double norm_cum_wt2 = it2 == calc2.end() ? 1 : static_cast<double>((*it2).second) / n2;
|
|
56
|
+
delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
|
|
57
|
+
return delta;
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
template<typename Sketch>
|
|
61
|
+
double kolmogorov_smirnov::threshold(const Sketch& sketch1, const Sketch& sketch2, double p) {
|
|
62
|
+
const double r1 = sketch1.get_num_retained();
|
|
63
|
+
const double r2 = sketch2.get_num_retained();
|
|
64
|
+
const double alpha_factor = sqrt(-0.5 * log(0.5 * p));
|
|
65
|
+
const double delta_area_threshold = alpha_factor * sqrt((r1 + r2) / (r1 * r2));
|
|
66
|
+
const double eps1 = sketch1.get_normalized_rank_error(false);
|
|
67
|
+
const double eps2 = sketch2.get_normalized_rank_error(false);
|
|
68
|
+
return delta_area_threshold + eps1 + eps2;
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
template<typename Sketch>
|
|
72
|
+
bool kolmogorov_smirnov::test(const Sketch& sketch1, const Sketch& sketch2, double p) {
|
|
73
|
+
return delta(sketch1, sketch2) > threshold(sketch1, sketch2, p);
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
} /* namespace datasketches */
|
|
77
|
+
|
|
78
|
+
#endif
|