datasketches 0.2.0 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/README.md +7 -7
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/theta_wrapper.cpp +20 -4
  8. data/lib/datasketches/version.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
  10. data/vendor/datasketches-cpp/LICENSE +40 -3
  11. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  12. data/vendor/datasketches-cpp/NOTICE +1 -1
  13. data/vendor/datasketches-cpp/README.md +76 -9
  14. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  17. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  18. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  19. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  20. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  21. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  22. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  23. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  24. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
  25. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  26. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  27. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
  28. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
  29. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
  31. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  32. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  33. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  34. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  35. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  36. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  37. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  38. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  39. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  40. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
  41. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  42. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  43. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  44. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  45. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  46. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  47. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  48. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  49. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  50. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  51. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  52. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  53. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  54. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  55. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  56. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  57. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  58. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  59. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  60. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  61. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  62. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
  63. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  64. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  65. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  66. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  67. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  68. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  69. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  70. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  71. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  72. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  73. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  74. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  75. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  76. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  77. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  78. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  79. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  80. data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
  81. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  82. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  83. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  84. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  85. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
  86. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
  87. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  88. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  89. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  90. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
  91. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  92. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  93. data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
  94. data/vendor/datasketches-cpp/python/README.md +50 -50
  95. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  96. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  97. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  98. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
  99. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
  100. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  101. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  102. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  103. data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
  104. data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
  105. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  106. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  107. data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
  108. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  109. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  110. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  111. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  112. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
  113. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  114. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
  117. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
  118. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  119. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  120. data/vendor/datasketches-cpp/setup.py +10 -7
  121. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  122. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  124. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
  125. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
  126. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  127. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  128. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
  129. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  130. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  131. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
  132. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
  133. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
  134. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
  135. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
  137. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
  139. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
  140. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  141. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  142. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  144. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  145. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  146. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  147. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  148. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  149. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
  150. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
  151. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
  152. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  153. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  154. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
  155. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  157. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
  158. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
  159. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  160. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  161. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
  162. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  163. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  164. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  165. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  166. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
  167. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
  168. metadata +18 -7
  169. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  170. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
@@ -303,7 +303,7 @@ std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(const double* fractions,
303
303
  }
304
304
 
305
305
  template<typename T, typename C, typename S, typename A>
306
- std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(size_t num) const {
306
+ std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(uint32_t num) const {
307
307
  if (is_empty()) return std::vector<T, A>(allocator_);
308
308
  if (num == 0) {
309
309
  throw std::invalid_argument("num must be > 0");
@@ -380,36 +380,56 @@ size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
380
380
  size_t size = DATA_START + num_levels_ * sizeof(uint32_t);
381
381
  size += S().size_of_item(*min_value_);
382
382
  size += S().size_of_item(*max_value_);
383
- for (auto& it: *this) size += S().size_of_item(it.first);
383
+ for (auto it: *this) size += S().size_of_item(it.first);
384
384
  return size;
385
385
  }
386
386
 
387
+ // implementation for fixed-size arithmetic types (integral and floating point)
388
+ template<typename T, typename C, typename S, typename A>
389
+ template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
390
+ size_t kll_sketch<T, C, S, A>::get_max_serialized_size_bytes(uint16_t k, uint64_t n) {
391
+ const uint8_t num_levels = kll_helper::ub_on_num_levels(n);
392
+ const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, DEFAULT_M, num_levels);
393
+ // the last integer in the levels_ array is not serialized because it can be derived
394
+ return DATA_START + num_levels * sizeof(uint32_t) + (max_num_retained + 2) * sizeof(TT);
395
+ }
396
+
397
+ // implementation for all other types
398
+ template<typename T, typename C, typename S, typename A>
399
+ template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
400
+ size_t kll_sketch<T, C, S, A>::get_max_serialized_size_bytes(uint16_t k, uint64_t n, size_t max_item_size_bytes) {
401
+ const uint8_t num_levels = kll_helper::ub_on_num_levels(n);
402
+ const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, DEFAULT_M, num_levels);
403
+ // the last integer in the levels_ array is not serialized because it can be derived
404
+ return DATA_START + num_levels * sizeof(uint32_t) + (max_num_retained + 2) * max_item_size_bytes;
405
+ }
406
+
387
407
  template<typename T, typename C, typename S, typename A>
388
408
  void kll_sketch<T, C, S, A>::serialize(std::ostream& os) const {
389
409
  const bool is_single_item = n_ == 1;
390
410
  const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
391
- os.write(reinterpret_cast<const char*>(&preamble_ints), sizeof(preamble_ints));
411
+ write(os, preamble_ints);
392
412
  const uint8_t serial_version(is_single_item ? SERIAL_VERSION_2 : SERIAL_VERSION_1);
393
- os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
413
+ write(os, serial_version);
394
414
  const uint8_t family(FAMILY);
395
- os.write(reinterpret_cast<const char*>(&family), sizeof(family));
415
+ write(os, family);
396
416
  const uint8_t flags_byte(
397
417
  (is_empty() ? 1 << flags::IS_EMPTY : 0)
398
418
  | (is_level_zero_sorted_ ? 1 << flags::IS_LEVEL_ZERO_SORTED : 0)
399
419
  | (is_single_item ? 1 << flags::IS_SINGLE_ITEM : 0)
400
420
  );
401
- os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
402
- os.write((char*)&k_, sizeof(k_));
403
- os.write((char*)&m_, sizeof(m_));
421
+ write(os, flags_byte);
422
+ write(os, k_);
423
+ write(os, m_);
404
424
  const uint8_t unused = 0;
405
- os.write(reinterpret_cast<const char*>(&unused), sizeof(unused));
425
+ write(os, unused);
406
426
  if (is_empty()) return;
407
427
  if (!is_single_item) {
408
- os.write((char*)&n_, sizeof(n_));
409
- os.write((char*)&min_k_, sizeof(min_k_));
410
- os.write((char*)&num_levels_, sizeof(num_levels_));
411
- os.write((char*)&unused, sizeof(unused));
412
- os.write((char*)levels_.data(), sizeof(levels_[0]) * num_levels_);
428
+ write(os, n_);
429
+ write(os, min_k_);
430
+ write(os, num_levels_);
431
+ write(os, unused);
432
+ write(os, levels_.data(), sizeof(levels_[0]) * num_levels_);
413
433
  S().serialize(os, min_value_, 1);
414
434
  S().serialize(os, max_value_, 1);
415
435
  }
@@ -424,27 +444,26 @@ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const
424
444
  uint8_t* ptr = bytes.data() + header_size_bytes;
425
445
  const uint8_t* end_ptr = ptr + size;
426
446
  const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
427
- ptr += copy_to_mem(&preamble_ints, ptr, sizeof(preamble_ints));
447
+ ptr += copy_to_mem(preamble_ints, ptr);
428
448
  const uint8_t serial_version(is_single_item ? SERIAL_VERSION_2 : SERIAL_VERSION_1);
429
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
449
+ ptr += copy_to_mem(serial_version, ptr);
430
450
  const uint8_t family(FAMILY);
431
- ptr += copy_to_mem(&family, ptr, sizeof(family));
451
+ ptr += copy_to_mem(family, ptr);
432
452
  const uint8_t flags_byte(
433
453
  (is_empty() ? 1 << flags::IS_EMPTY : 0)
434
454
  | (is_level_zero_sorted_ ? 1 << flags::IS_LEVEL_ZERO_SORTED : 0)
435
455
  | (is_single_item ? 1 << flags::IS_SINGLE_ITEM : 0)
436
456
  );
437
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
438
- ptr += copy_to_mem(&k_, ptr, sizeof(k_));
439
- ptr += copy_to_mem(&m_, ptr, sizeof(m_));
440
- const uint8_t unused = 0;
441
- ptr += copy_to_mem(&unused, ptr, sizeof(unused));
457
+ ptr += copy_to_mem(flags_byte, ptr);
458
+ ptr += copy_to_mem(k_, ptr);
459
+ ptr += copy_to_mem(m_, ptr);
460
+ ptr += sizeof(uint8_t); // unused
442
461
  if (!is_empty()) {
443
462
  if (!is_single_item) {
444
- ptr += copy_to_mem(&n_, ptr, sizeof(n_));
445
- ptr += copy_to_mem(&min_k_, ptr, sizeof(min_k_));
446
- ptr += copy_to_mem(&num_levels_, ptr, sizeof(num_levels_));
447
- ptr += copy_to_mem(&unused, ptr, sizeof(unused));
463
+ ptr += copy_to_mem(n_, ptr);
464
+ ptr += copy_to_mem(min_k_, ptr);
465
+ ptr += copy_to_mem(num_levels_, ptr);
466
+ ptr += sizeof(uint8_t); // unused
448
467
  ptr += copy_to_mem(levels_.data(), ptr, sizeof(levels_[0]) * num_levels_);
449
468
  ptr += S().serialize(ptr, end_ptr - ptr, min_value_, 1);
450
469
  ptr += S().serialize(ptr, end_ptr - ptr, max_value_, 1);
@@ -459,20 +478,13 @@ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const
459
478
 
460
479
  template<typename T, typename C, typename S, typename A>
461
480
  kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, const A& allocator) {
462
- uint8_t preamble_ints;
463
- is.read((char*)&preamble_ints, sizeof(preamble_ints));
464
- uint8_t serial_version;
465
- is.read((char*)&serial_version, sizeof(serial_version));
466
- uint8_t family_id;
467
- is.read((char*)&family_id, sizeof(family_id));
468
- uint8_t flags_byte;
469
- is.read((char*)&flags_byte, sizeof(flags_byte));
470
- uint16_t k;
471
- is.read((char*)&k, sizeof(k));
472
- uint8_t m;
473
- is.read((char*)&m, sizeof(m));
474
- uint8_t unused;
475
- is.read((char*)&unused, sizeof(unused));
481
+ const auto preamble_ints = read<uint8_t>(is);
482
+ const auto serial_version = read<uint8_t>(is);
483
+ const auto family_id = read<uint8_t>(is);
484
+ const auto flags_byte = read<uint8_t>(is);
485
+ const auto k = read<uint16_t>(is);
486
+ const auto m = read<uint8_t>(is);
487
+ read<uint8_t>(is); // skip unused byte
476
488
 
477
489
  check_m(m);
478
490
  check_preamble_ints(preamble_ints, flags_byte);
@@ -492,10 +504,10 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, con
492
504
  min_k = k;
493
505
  num_levels = 1;
494
506
  } else {
495
- is.read((char*)&n, sizeof(n_));
496
- is.read((char*)&min_k, sizeof(min_k_));
497
- is.read((char*)&num_levels, sizeof(num_levels));
498
- is.read((char*)&unused, sizeof(unused));
507
+ n = read<uint64_t>(is);
508
+ min_k = read<uint16_t>(is);
509
+ num_levels = read<uint8_t>(is);
510
+ read<uint8_t>(is); // skip unused byte
499
511
  }
500
512
  vector_u32<A> levels(num_levels + 1, 0, allocator);
501
513
  const uint32_t capacity(kll_helper::compute_total_capacity(k, m, num_levels));
@@ -503,7 +515,7 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, con
503
515
  levels[0] = capacity - 1;
504
516
  } else {
505
517
  // the last integer in levels_ is not serialized because it can be derived
506
- is.read((char*)levels.data(), sizeof(levels[0]) * num_levels);
518
+ read(is, levels.data(), sizeof(levels[0]) * num_levels);
507
519
  }
508
520
  levels[num_levels] = capacity;
509
521
  A alloc(allocator);
@@ -546,24 +558,24 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, si
546
558
  ensure_minimum_memory(size, 8);
547
559
  const char* ptr = static_cast<const char*>(bytes);
548
560
  uint8_t preamble_ints;
549
- ptr += copy_from_mem(ptr, &preamble_ints, sizeof(preamble_ints));
561
+ ptr += copy_from_mem(ptr, preamble_ints);
550
562
  uint8_t serial_version;
551
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
563
+ ptr += copy_from_mem(ptr, serial_version);
552
564
  uint8_t family_id;
553
- ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
565
+ ptr += copy_from_mem(ptr, family_id);
554
566
  uint8_t flags_byte;
555
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
567
+ ptr += copy_from_mem(ptr, flags_byte);
556
568
  uint16_t k;
557
- ptr += copy_from_mem(ptr, &k, sizeof(k));
569
+ ptr += copy_from_mem(ptr, k);
558
570
  uint8_t m;
559
- ptr += copy_from_mem(ptr, &m, sizeof(m));
560
- ptr++; // skip unused byte
571
+ ptr += copy_from_mem(ptr, m);
572
+ ptr += sizeof(uint8_t); // skip unused byte
561
573
 
562
574
  check_m(m);
563
575
  check_preamble_ints(preamble_ints, flags_byte);
564
576
  check_serial_version(serial_version);
565
577
  check_family_id(family_id);
566
- ensure_minimum_memory(size, 1 << preamble_ints);
578
+ ensure_minimum_memory(size, preamble_ints * sizeof(uint32_t));
567
579
 
568
580
  const bool is_empty(flags_byte & (1 << flags::IS_EMPTY));
569
581
  if (is_empty) return kll_sketch<T, C, S, A>(k, allocator);
@@ -578,10 +590,10 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, si
578
590
  min_k = k;
579
591
  num_levels = 1;
580
592
  } else {
581
- ptr += copy_from_mem(ptr, &n, sizeof(n));
582
- ptr += copy_from_mem(ptr, &min_k, sizeof(min_k));
583
- ptr += copy_from_mem(ptr, &num_levels, sizeof(num_levels));
584
- ptr++; // skip unused byte
593
+ ptr += copy_from_mem(ptr, n);
594
+ ptr += copy_from_mem(ptr, min_k);
595
+ ptr += copy_from_mem(ptr, num_levels);
596
+ ptr += sizeof(uint8_t); // skip unused byte
585
597
  }
586
598
  vector_u32<A> levels(num_levels + 1, 0, allocator);
587
599
  const uint32_t capacity(kll_helper::compute_total_capacity(k, m, num_levels));
@@ -779,7 +791,7 @@ std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantil
779
791
  using AllocCalc = typename std::allocator_traits<A>::template rebind_alloc<kll_quantile_calculator<T, C, A>>;
780
792
  AllocCalc alloc(allocator_);
781
793
  std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> quantile_calculator(
782
- new (alloc.allocate(1)) kll_quantile_calculator<T, C, A>(items_, levels_.data(), num_levels_, n_, allocator_),
794
+ new (alloc.allocate(1)) kll_quantile_calculator<T, C, A>(*this),
783
795
  [&alloc](kll_quantile_calculator<T, C, A>* ptr){ ptr->~kll_quantile_calculator<T, C, A>(); alloc.deallocate(ptr, 1); }
784
796
  );
785
797
  return quantile_calculator;
@@ -1011,7 +1023,9 @@ void kll_sketch<T, C, S, A>::check_family_id(uint8_t family_id) {
1011
1023
 
1012
1024
  template <typename T, typename C, typename S, typename A>
1013
1025
  string<A> kll_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items) const {
1014
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
1026
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
1027
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
1028
+ std::ostringstream os;
1015
1029
  os << "### KLL sketch summary:" << std::endl;
1016
1030
  os << " K : " << k_ << std::endl;
1017
1031
  os << " min K : " << min_k_ << std::endl;
@@ -1057,7 +1071,7 @@ string<A> kll_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items)
1057
1071
  }
1058
1072
  os << "### End sketch data" << std::endl;
1059
1073
  }
1060
- return os.str();
1074
+ return string<A>(os.str().c_str(), allocator_);
1061
1075
  }
1062
1076
 
1063
1077
  template <typename T, typename C, typename S, typename A>
@@ -1067,14 +1081,14 @@ typename kll_sketch<T, C, S, A>::const_iterator kll_sketch<T, C, S, A>::begin()
1067
1081
 
1068
1082
  template <typename T, typename C, typename S, typename A>
1069
1083
  typename kll_sketch<T, C, S, A>::const_iterator kll_sketch<T, C, S, A>::end() const {
1070
- return kll_sketch<T, C, S, A>::const_iterator(nullptr, nullptr, num_levels_);
1084
+ return kll_sketch<T, C, S, A>::const_iterator(nullptr, levels_.data(), num_levels_);
1071
1085
  }
1072
1086
 
1073
1087
  // kll_sketch::const_iterator implementation
1074
1088
 
1075
1089
  template<typename T, typename C, typename S, typename A>
1076
1090
  kll_sketch<T, C, S, A>::const_iterator::const_iterator(const T* items, const uint32_t* levels, const uint8_t num_levels):
1077
- items(items), levels(levels), num_levels(num_levels), index(levels == nullptr ? 0 : levels[0]), level(levels == nullptr ? num_levels : 0), weight(1)
1091
+ items(items), levels(levels), num_levels(num_levels), index(items == nullptr ? levels[num_levels] : levels[0]), level(items == nullptr ? num_levels : 0), weight(1)
1078
1092
  {}
1079
1093
 
1080
1094
  template<typename T, typename C, typename S, typename A>
@@ -1098,8 +1112,6 @@ typename kll_sketch<T, C, S, A>::const_iterator& kll_sketch<T, C, S, A>::const_i
1098
1112
 
1099
1113
  template<typename T, typename C, typename S, typename A>
1100
1114
  bool kll_sketch<T, C, S, A>::const_iterator::operator==(const const_iterator& other) const {
1101
- if (level != other.level) return false;
1102
- if (level == num_levels) return true; // end
1103
1115
  return index == other.index;
1104
1116
  }
1105
1117
 
data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
@@ -0,0 +1,67 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef KOLMOGOROV_SMIRNOV_HPP_
21
+ #define KOLMOGOROV_SMIRNOV_HPP_
22
+
23
+ namespace datasketches {
24
+
25
+ class kolmogorov_smirnov {
26
+ public:
27
+ /**
28
+ * Computes the raw delta area between two KLL quantile sketches for the Kolmogorov-Smirnov Test.
29
+ * @param sketch1 KLL sketch 1
30
+ * @param sketch2 KLL sketch 2
31
+ * @return the raw delta between two KLL quantile sketches
32
+ */
33
+ template<typename Sketch>
34
+ static double delta(const Sketch& sketch1, const Sketch& sketch2);
35
+
36
+ /**
37
+ * Computes the adjusted delta area threshold for the Kolmogorov-Smirnov Test.
38
+ * Adjusts the computed threshold by the error epsilons of the two given sketches.
39
+ * See <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov–Smirnov Test</a>
40
+ * @param sketch1 KLL sketch 1
41
+ * @param sketch2 KLL sketch 2
42
+ * @param p Target p-value. Typically .001 to .1, e.g., .05.
43
+ * @return the adjusted threshold to be compared with the raw delta
44
+ */
45
+ template<typename Sketch>
46
+ static double threshold(const Sketch& sketch1, const Sketch& sketch2, double p);
47
+
48
+ /**
49
+ * Performs the Kolmogorov-Smirnov Test between two KLL quantiles sketches.
50
+ * Note: if the given sketches have insufficient data or if the sketch sizes are too small,
51
+ * this will return false.
52
+ * @param sketch1 KLL sketch 1
53
+ * @param sketch2 KLL sketch 2
54
+ * @param p Target p-value. Typically .001 to .1, e.g., .05.
55
+ * @return Boolean indicating whether we can reject the null hypothesis (that the sketches
56
+ * reflect the same underlying distribution) using the provided p-value.
57
+ */
58
+ template<typename Sketch>
59
+ static bool test(const Sketch& sketch1, const Sketch& sketch2, double p);
60
+
61
+ };
62
+
63
+ } /* namespace datasketches */
64
+
65
+ #include "kolmogorov_smirnov_impl.hpp"
66
+
67
+ #endif
data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
@@ -0,0 +1,78 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef KOLMOGOROV_SMIRNOV_IMPL_HPP_
21
+ #define KOLMOGOROV_SMIRNOV_IMPL_HPP_
22
+
23
+ namespace datasketches {
24
+
25
+ // type resolver
26
+ template<typename T, typename C, typename S, typename A>
27
+ kll_quantile_calculator<T, C, A> make_quantile_calculator(const kll_sketch<T, C, S, A>& sketch) {
28
+ return kll_quantile_calculator<T, C, A>(sketch);
29
+ }
30
+
31
+ template<typename Sketch>
32
+ double kolmogorov_smirnov::delta(const Sketch& sketch1, const Sketch& sketch2) {
33
+ using Comparator = typename Sketch::comparator;
34
+ auto calc1 = make_quantile_calculator(sketch1);
35
+ auto calc2 = make_quantile_calculator(sketch2);
36
+ auto it1 = calc1.begin();
37
+ auto it2 = calc2.begin();
38
+ const auto n1 = sketch1.get_n();
39
+ const auto n2 = sketch2.get_n();
40
+ double delta = 0;
41
+ while (it1 != calc1.end() && it2 != calc2.end()) {
42
+ const double norm_cum_wt1 = static_cast<double>((*it1).second) / n1;
43
+ const double norm_cum_wt2 = static_cast<double>((*it2).second) / n2;
44
+ delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
45
+ if (Comparator()((*it1).first, (*it2).first)) {
46
+ ++it1;
47
+ } else if (Comparator()((*it2).first, (*it1).first)) {
48
+ ++it2;
49
+ } else {
50
+ ++it1;
51
+ ++it2;
52
+ }
53
+ }
54
+ const double norm_cum_wt1 = it1 == calc1.end() ? 1 : static_cast<double>((*it1).second) / n1;
55
+ const double norm_cum_wt2 = it2 == calc2.end() ? 1 : static_cast<double>((*it2).second) / n2;
56
+ delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
57
+ return delta;
58
+ }
59
+
60
+ template<typename Sketch>
61
+ double kolmogorov_smirnov::threshold(const Sketch& sketch1, const Sketch& sketch2, double p) {
62
+ const double r1 = sketch1.get_num_retained();
63
+ const double r2 = sketch2.get_num_retained();
64
+ const double alpha_factor = sqrt(-0.5 * log(0.5 * p));
65
+ const double delta_area_threshold = alpha_factor * sqrt((r1 + r2) / (r1 * r2));
66
+ const double eps1 = sketch1.get_normalized_rank_error(false);
67
+ const double eps2 = sketch2.get_normalized_rank_error(false);
68
+ return delta_area_threshold + eps1 + eps2;
69
+ }
70
+
71
+ template<typename Sketch>
72
+ bool kolmogorov_smirnov::test(const Sketch& sketch1, const Sketch& sketch2, double p) {
73
+ return delta(sketch1, sketch2) > threshold(sketch1, sketch2, p);
74
+ }
75
+
76
+ } /* namespace datasketches */
77
+
78
+ #endif
data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
@@ -41,4 +41,5 @@ target_sources(kll_test
41
41
  kll_sketch_test.cpp
42
42
  kll_sketch_custom_type_test.cpp
43
43
  kll_sketch_validation.cpp
44
+ kolmogorov_smirnov_test.cpp
44
45
  )