datasketches 0.2.0 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/README.md +7 -7
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/theta_wrapper.cpp +20 -4
  8. data/lib/datasketches/version.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
  10. data/vendor/datasketches-cpp/LICENSE +40 -3
  11. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  12. data/vendor/datasketches-cpp/NOTICE +1 -1
  13. data/vendor/datasketches-cpp/README.md +76 -9
  14. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  17. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  18. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  19. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  20. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  21. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  22. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  23. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  24. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
  25. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  26. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  27. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
  28. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
  29. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
  31. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  32. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  33. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  34. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  35. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  36. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  37. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  38. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  39. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  40. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
  41. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  42. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  43. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  44. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  45. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  46. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  47. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  48. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  49. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  50. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  51. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  52. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  53. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  54. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  55. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  56. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  57. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  58. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  59. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  60. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  61. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  62. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
  63. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  64. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  65. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  66. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  67. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  68. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  69. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  70. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  71. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  72. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  73. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  74. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  75. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  76. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  77. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  78. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  79. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  80. data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
  81. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  82. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  83. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  84. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  85. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
  86. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
  87. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  88. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  89. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  90. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
  91. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  92. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  93. data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
  94. data/vendor/datasketches-cpp/python/README.md +50 -50
  95. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  96. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  97. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  98. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
  99. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
  100. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  101. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  102. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  103. data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
  104. data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
  105. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  106. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  107. data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
  108. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  109. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  110. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  111. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  112. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
  113. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  114. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
  117. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
  118. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  119. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  120. data/vendor/datasketches-cpp/setup.py +10 -7
  121. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  122. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  124. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
  125. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
  126. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  127. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  128. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
  129. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  130. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  131. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
  132. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
  133. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
  134. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
  135. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
  137. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
  139. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
  140. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  141. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  142. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  144. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  145. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  146. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  147. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  148. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  149. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
  150. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
  151. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
  152. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  153. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  154. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
  155. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  157. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
  158. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
  159. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  160. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  161. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
  162. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  163. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  164. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  165. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  166. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
  167. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
  168. metadata +18 -7
  169. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  170. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -31,9 +31,9 @@ static inline uint64_t divide_longs_rounding_up(uint64_t x, uint64_t y) {
31
31
  else return quotient + 1;
32
32
  }
33
33
 
34
- static inline uint64_t long_floor_log2_of_long(uint64_t x) {
35
- if (x < 1) throw std::invalid_argument("long_floor_log2_of_long: bad argument");
36
- uint64_t p = 0;
34
+ static inline uint8_t floor_log2_of_long(uint64_t x) {
35
+ if (x < 1) throw std::invalid_argument("floor_log2_of_long: bad argument");
36
+ uint8_t p = 0;
37
37
  uint64_t y = 1;
38
38
  while (true) {
39
39
  if (y == x) return p;
@@ -69,7 +69,7 @@ static inline uint64_t wegner_count_bits_set_in_matrix(const uint64_t* array, si
69
69
  // Note: this is an adaptation of the Java code,
70
70
  // which is apparently a variation of Figure 5-2 in "Hacker's Delight"
71
71
  // by Henry S. Warren.
72
- static inline uint64_t warren_bit_count(uint64_t i) {
72
+ static inline uint32_t warren_bit_count(uint64_t i) {
73
73
  i = i - ((i >> 1) & 0x5555555555555555ULL);
74
74
  i = (i & 0x3333333333333333ULL) + ((i >> 2) & 0x3333333333333333ULL);
75
75
  i = (i + (i >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
@@ -79,9 +79,9 @@ static inline uint64_t warren_bit_count(uint64_t i) {
79
79
  return i & 0x7f;
80
80
  }
81
81
 
82
- static inline uint64_t warren_count_bits_set_in_matrix(const uint64_t* array, size_t length) {
83
- uint64_t count = 0;
84
- for (size_t i = 0; i < length; i++) {
82
+ static inline uint32_t warren_count_bits_set_in_matrix(const uint64_t* array, uint32_t length) {
83
+ uint32_t count = 0;
84
+ for (uint32_t i = 0; i < length; i++) {
85
85
  count += warren_bit_count(array[i]);
86
86
  }
87
87
  return count;
@@ -91,13 +91,13 @@ static inline uint64_t warren_count_bits_set_in_matrix(const uint64_t* array, si
91
91
 
92
92
  #define CSA(h,l,a,b,c) {uint64_t u = a ^ b; uint64_t v = c; h = (a & b) | (u & v); l = u ^ v;}
93
93
 
94
- static inline uint64_t count_bits_set_in_matrix(const uint64_t* a, size_t length) {
94
+ static inline uint32_t count_bits_set_in_matrix(const uint64_t* a, uint32_t length) {
95
95
  if ((length & 0x7) != 0) throw std::invalid_argument("the length of the array must be a multiple of 8");
96
- uint64_t total = 0;
96
+ uint32_t total = 0;
97
97
  uint64_t ones, twos, twos_a, twos_b, fours, fours_a, fours_b, eights;
98
98
  fours = twos = ones = 0;
99
99
 
100
- for (size_t i = 0; i <= length - 8; i = i + 8) {
100
+ for (uint32_t i = 0; i <= length - 8; i += 8) {
101
101
  CSA(twos_a, ones, ones, a[i+0], a[i+1]);
102
102
  CSA(twos_b, ones, ones, a[i+2], a[i+3]);
103
103
  CSA(fours_a, twos, twos, twos_a, twos_b);
@@ -245,12 +245,12 @@ static inline double icon_exponential_approximation(double k, double c) {
245
245
  return (0.7940236163830469 * k * pow(2.0, c / k));
246
246
  }
247
247
 
248
- static inline double compute_icon_estimate(uint8_t lg_k, uint64_t c) {
248
+ static inline double compute_icon_estimate(uint8_t lg_k, uint32_t c) {
249
249
  if (lg_k < ICON_MIN_LOG_K || lg_k > ICON_MAX_LOG_K) throw std::out_of_range("lg_k out of range");
250
250
  if (c < 2) return ((c == 0) ? 0.0 : 1.0);
251
- const size_t k = 1 << lg_k;
252
- const double double_k = k;
253
- const double double_c = c;
251
+ const uint32_t k = 1 << lg_k;
252
+ const double double_k = static_cast<double>(k);
253
+ const double double_c = static_cast<double>(c);
254
254
  // Differing thresholds ensure that the approximated estimator is monotonically increasing.
255
255
  const double threshold_factor = ((lg_k < 14) ? 5.7 : 5.6);
256
256
  if (double_c > (threshold_factor * double_k)) return icon_exponential_approximation(double_k, double_c);
@@ -29,11 +29,11 @@
29
29
 
30
30
  namespace datasketches {
31
31
 
32
- static const uint64_t U32_TABLE_UPSIZE_NUMER = 3LL;
33
- static const uint64_t U32_TABLE_UPSIZE_DENOM = 4LL;
32
+ static const uint32_t U32_TABLE_UPSIZE_NUMER = 3LL;
33
+ static const uint32_t U32_TABLE_UPSIZE_DENOM = 4LL;
34
34
 
35
- static const uint64_t U32_TABLE_DOWNSIZE_NUMER = 1LL;
36
- static const uint64_t U32_TABLE_DOWNSIZE_DENOM = 4LL;
35
+ static const uint32_t U32_TABLE_DOWNSIZE_NUMER = 1LL;
36
+ static const uint32_t U32_TABLE_DOWNSIZE_DENOM = 4LL;
37
37
 
38
38
  template<typename A>
39
39
  class u32_table {
@@ -42,7 +42,7 @@ public:
42
42
  u32_table(const A& allocator);
43
43
  u32_table(uint8_t lg_size, uint8_t num_valid_bits, const A& allocator);
44
44
 
45
- inline size_t get_num_items() const;
45
+ inline uint32_t get_num_items() const;
46
46
  inline const uint32_t* get_slots() const;
47
47
  inline uint8_t get_lg_size() const;
48
48
  inline void clear();
@@ -52,7 +52,7 @@ public:
52
52
  // returns true iff the item was present and was therefore removed from the table
53
53
  inline bool maybe_delete(uint32_t item);
54
54
 
55
- static u32_table make_from_pairs(const uint32_t* pairs, size_t num_pairs, uint8_t lg_k, const A& allocator);
55
+ static u32_table make_from_pairs(const uint32_t* pairs, uint32_t num_pairs, uint8_t lg_k, const A& allocator);
56
56
 
57
57
  vector_u32<A> unwrapping_get_items() const;
58
58
 
@@ -69,10 +69,10 @@ private:
69
69
 
70
70
  uint8_t lg_size; // log2 of number of slots
71
71
  uint8_t num_valid_bits;
72
- size_t num_items;
72
+ uint32_t num_items;
73
73
  vector_u32<A> slots;
74
74
 
75
- inline size_t lookup(uint32_t item) const;
75
+ inline uint32_t lookup(uint32_t item) const;
76
76
  inline void must_insert(uint32_t item);
77
77
  inline void rebuild(uint8_t new_lg_size);
78
78
  };
@@ -41,14 +41,14 @@ u32_table<A>::u32_table(uint8_t lg_size, uint8_t num_valid_bits, const A& alloca
41
41
  lg_size(lg_size),
42
42
  num_valid_bits(num_valid_bits),
43
43
  num_items(0),
44
- slots(1 << lg_size, UINT32_MAX, allocator)
44
+ slots(1ULL << lg_size, UINT32_MAX, allocator)
45
45
  {
46
46
  if (lg_size < 2) throw std::invalid_argument("lg_size must be >= 2");
47
47
  if (num_valid_bits < 1 || num_valid_bits > 32) throw std::invalid_argument("num_valid_bits must be between 1 and 32");
48
48
  }
49
49
 
50
50
  template<typename A>
51
- size_t u32_table<A>::get_num_items() const {
51
+ uint32_t u32_table<A>::get_num_items() const {
52
52
  return num_items;
53
53
  }
54
54
 
@@ -70,7 +70,7 @@ void u32_table<A>::clear() {
70
70
 
71
71
  template<typename A>
72
72
  bool u32_table<A>::maybe_insert(uint32_t item) {
73
- const size_t index = lookup(item);
73
+ const uint32_t index = lookup(item);
74
74
  if (slots[index] == item) return false;
75
75
  if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert");
76
76
  slots[index] = item;
@@ -83,7 +83,7 @@ bool u32_table<A>::maybe_insert(uint32_t item) {
83
83
 
84
84
  template<typename A>
85
85
  bool u32_table<A>::maybe_delete(uint32_t item) {
86
- const size_t index = lookup(item);
86
+ const uint32_t index = lookup(item);
87
87
  if (slots[index] == UINT32_MAX) return false;
88
88
  if (slots[index] != item) throw std::logic_error("item does not exist");
89
89
  if (num_items == 0) throw std::logic_error("delete error");
@@ -110,7 +110,7 @@ bool u32_table<A>::maybe_delete(uint32_t item) {
110
110
 
111
111
  // this one is specifically tailored to be a part of fm85 decompression scheme
112
112
  template<typename A>
113
- u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs, size_t num_pairs, uint8_t lg_k, const A& allocator) {
113
+ u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs, uint32_t num_pairs, uint8_t lg_k, const A& allocator) {
114
114
  uint8_t lg_num_slots = 2;
115
115
  while (U32_TABLE_UPSIZE_DENOM * num_pairs > U32_TABLE_UPSIZE_NUMER * (1 << lg_num_slots)) lg_num_slots++;
116
116
  u32_table<A> table(lg_num_slots, 6 + lg_k, allocator);
@@ -124,11 +124,11 @@ u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs, size_t num_pai
124
124
  }
125
125
 
126
126
  template<typename A>
127
- size_t u32_table<A>::lookup(uint32_t item) const {
128
- const size_t size = 1 << lg_size;
129
- const size_t mask = size - 1;
127
+ uint32_t u32_table<A>::lookup(uint32_t item) const {
128
+ const uint32_t size = 1 << lg_size;
129
+ const uint32_t mask = size - 1;
130
130
  const uint8_t shift = num_valid_bits - lg_size;
131
- size_t probe = item >> shift;
131
+ uint32_t probe = item >> shift;
132
132
  if (probe > mask) throw std::logic_error("probe out of range");
133
133
  while (slots[probe] != item && slots[probe] != UINT32_MAX) {
134
134
  probe = (probe + 1) & mask;
@@ -139,7 +139,7 @@ size_t u32_table<A>::lookup(uint32_t item) const {
139
139
  // counts and resizing must be handled by the caller
140
140
  template<typename A>
141
141
  void u32_table<A>::must_insert(uint32_t item) {
142
- const size_t index = lookup(item);
142
+ const uint32_t index = lookup(item);
143
143
  if (slots[index] == item) throw std::logic_error("item exists");
144
144
  if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert");
145
145
  slots[index] = item;
@@ -148,13 +148,13 @@ void u32_table<A>::must_insert(uint32_t item) {
148
148
  template<typename A>
149
149
  void u32_table<A>::rebuild(uint8_t new_lg_size) {
150
150
  if (new_lg_size < 2) throw std::logic_error("lg_size must be >= 2");
151
- const size_t old_size = 1 << lg_size;
152
- const size_t new_size = 1 << new_lg_size;
151
+ const uint32_t old_size = 1 << lg_size;
152
+ const uint32_t new_size = 1 << new_lg_size;
153
153
  if (new_size <= num_items) throw std::logic_error("new_size <= num_items");
154
154
  vector_u32<A> old_slots = std::move(slots);
155
155
  slots = vector_u32<A>(new_size, UINT32_MAX, old_slots.get_allocator());
156
156
  lg_size = new_lg_size;
157
- for (size_t i = 0; i < old_size; i++) {
157
+ for (uint32_t i = 0; i < old_size; i++) {
158
158
  if (old_slots[i] != UINT32_MAX) {
159
159
  must_insert(old_slots[i]);
160
160
  }
@@ -170,7 +170,7 @@ void u32_table<A>::rebuild(uint8_t new_lg_size) {
170
170
  template<typename A>
171
171
  vector_u32<A> u32_table<A>::unwrapping_get_items() const {
172
172
  if (num_items == 0) return vector_u32<A>(slots.get_allocator());
173
- const size_t table_size = 1 << lg_size;
173
+ const uint32_t table_size = 1 << lg_size;
174
174
  vector_u32<A> result(num_items, 0, slots.get_allocator());
175
175
  size_t i = 0;
176
176
  size_t l = 0;
@@ -27,38 +27,38 @@ namespace datasketches {
27
27
  typedef u32_table<std::allocator<void>> table;
28
28
 
29
29
  TEST_CASE("cpc sketch: compress and decompress pairs", "[cpc_sketch]") {
30
- const int N = 200;
31
- const int MAXWORDS = 1000;
30
+ const size_t N = 200;
31
+ const size_t MAXWORDS = 1000;
32
32
 
33
33
  HashState twoHashes;
34
34
  uint32_t pairArray[N];
35
35
  uint32_t pairArray2[N];
36
36
  uint64_t value = 35538947; // some arbitrary starting value
37
37
  const uint64_t golden64 = 0x9e3779b97f4a7c13ULL; // the golden ratio
38
- for (int i = 0; i < N; i++) {
38
+ for (size_t i = 0; i < N; i++) {
39
39
  MurmurHash3_x64_128(&value, sizeof(value), 0, twoHashes);
40
40
  uint32_t rand = twoHashes.h1 & 0xffff;
41
41
  pairArray[i] = rand;
42
42
  value += golden64;
43
43
  }
44
44
  //table::knuth_shell_sort3(pairArray, 0, N - 1); // unsigned numerical sort
45
- std::sort(pairArray, &pairArray[N]);
45
+ std::sort(pairArray, pairArray + N);
46
46
  uint32_t prev = UINT32_MAX;
47
- int nxt = 0;
48
- for (int i = 0; i < N; i++) { // uniquify
47
+ uint32_t nxt = 0;
48
+ for (size_t i = 0; i < N; i++) { // uniquify
49
49
  if (pairArray[i] != prev) {
50
50
  prev = pairArray[i];
51
51
  pairArray[nxt++] = pairArray[i];
52
52
  }
53
53
  }
54
- int numPairs = nxt;
54
+ uint32_t numPairs = nxt;
55
55
 
56
56
  uint32_t compressedWords[MAXWORDS];
57
57
 
58
- for (size_t numBaseBits = 0; numBaseBits <= 11; numBaseBits++) {
59
- size_t numWordsWritten = get_compressor<std::allocator<void>>().low_level_compress_pairs(pairArray, numPairs, numBaseBits, compressedWords);
58
+ for (uint8_t numBaseBits = 0; numBaseBits <= 11; numBaseBits++) {
59
+ uint32_t numWordsWritten = get_compressor<std::allocator<void>>().low_level_compress_pairs(pairArray, numPairs, numBaseBits, compressedWords);
60
60
  get_compressor<std::allocator<void>>().low_level_uncompress_pairs(pairArray2, numPairs, numBaseBits, compressedWords, numWordsWritten);
61
- for (int i = 0; i < numPairs; i++) {
61
+ for (size_t i = 0; i < numPairs; i++) {
62
62
  REQUIRE(pairArray[i] == pairArray2[i]);
63
63
  }
64
64
  }
@@ -25,6 +25,7 @@
25
25
  #include <catch.hpp>
26
26
 
27
27
  #include "cpc_sketch.hpp"
28
+ #include "cpc_union.hpp"
28
29
  #include "test_allocator.hpp"
29
30
 
30
31
  namespace datasketches {
@@ -234,4 +235,20 @@ TEST_CASE("cpc sketch allocation: serialize deserialize sliding, bytes", "[cpc_s
234
235
  REQUIRE(test_allocator_net_allocations == 0);
235
236
  }
236
237
 
238
+ using cpc_union_test_alloc = cpc_union_alloc<test_allocator<uint8_t>>;
239
+
240
+ TEST_CASE("cpc sketch allocation: union") {
241
+ cpc_sketch_test_alloc s1(11, DEFAULT_SEED, 0);
242
+ s1.update(1);
243
+
244
+ cpc_sketch_test_alloc s2(11, DEFAULT_SEED, 0);
245
+ s2.update(2);
246
+
247
+ cpc_union_test_alloc u(11, DEFAULT_SEED, 0);
248
+ u.update(s1);
249
+ u.update(s2);
250
+ auto s3 = u.get_result();
251
+ REQUIRE_FALSE(s3.is_empty());
252
+ }
253
+
237
254
  } /* namespace datasketches */
@@ -283,6 +283,26 @@ TEST_CASE("cpc sketch: serialize deserialize sliding, bytes", "[cpc_sketch]") {
283
283
  REQUIRE(deserialized.validate());
284
284
  }
285
285
 
286
+ TEST_CASE("cpc sketch: serialize deserialize sliding huge", "[cpc_sketch]") {
287
+ cpc_sketch sketch(26);
288
+ const int n = 10000000;
289
+ for (int i = 0; i < n; i++) sketch.update(i);
290
+ REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.001));
291
+ auto bytes = sketch.serialize();
292
+ cpc_sketch deserialized = cpc_sketch::deserialize(bytes.data(), bytes.size());
293
+ REQUIRE(deserialized.is_empty() == sketch.is_empty());
294
+ REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
295
+ REQUIRE(deserialized.validate());
296
+ REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), 7), std::out_of_range);
297
+ REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), 15), std::out_of_range);
298
+ REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
299
+
300
+ // updating again with the same values should not change the sketch
301
+ for (int i = 0; i < n; i++) deserialized.update(i);
302
+ REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
303
+ REQUIRE(deserialized.validate());
304
+ }
305
+
286
306
  TEST_CASE("cpc sketch: copy", "[cpc_sketch]") {
287
307
  cpc_sketch s1(11);
288
308
  s1.update(1);
@@ -378,4 +398,9 @@ TEST_CASE("cpc sketch: update string equivalence", "[cpc_sketch]") {
378
398
  REQUIRE(sketch.get_estimate() == Approx(1).margin(RELATIVE_ERROR_FOR_LG_K_11));
379
399
  }
380
400
 
401
+ TEST_CASE("cpc sketch: max serialized size", "[cpc_sketch]") {
402
+ REQUIRE(cpc_sketch::get_max_serialized_size_bytes(4) == 24 + 40);
403
+ REQUIRE(cpc_sketch::get_max_serialized_size_bytes(26) == static_cast<size_t>((0.6 * (1 << 26)) + 40));
404
+ }
405
+
381
406
  } /* namespace datasketches */
@@ -81,7 +81,7 @@ TEST_CASE("cpc union: large", "[cpc_union]") {
81
81
  cpc_union u(11);
82
82
  for (int i = 0; i < 1000; i++) {
83
83
  cpc_sketch tmp(11);
84
- for (int i = 0; i < 10000; i++) {
84
+ for (int j = 0; j < 10000; j++) {
85
85
  s.update(key);
86
86
  tmp.update(key);
87
87
  key++;
@@ -32,23 +32,13 @@ target_include_directories(fi
32
32
  target_link_libraries(fi INTERFACE common)
33
33
  target_compile_features(fi INTERFACE cxx_std_11)
34
34
 
35
- set(fi_HEADERS "")
36
- list(APPEND fi_HEADERS "include/frequent_items_sketch.hpp")
37
- list(APPEND fi_HEADERS "include/frequent_items_sketch_impl.hpp")
38
- list(APPEND fi_HEADERS "include/reverse_purge_hash_map.hpp")
39
- list(APPEND fi_HEADERS "include/reverse_purge_hash_map_impl.hpp")
40
-
41
35
  install(TARGETS fi
42
36
  EXPORT ${PROJECT_NAME}
43
37
  )
44
38
 
45
- install(FILES ${fi_HEADERS}
39
+ install(FILES
40
+ include/frequent_items_sketch.hpp
41
+ include/frequent_items_sketch_impl.hpp
42
+ include/reverse_purge_hash_map.hpp
43
+ include/reverse_purge_hash_map_impl.hpp
46
44
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
47
-
48
- target_sources(fi
49
- INTERFACE
50
- ${CMAKE_CURRENT_SOURCE_DIR}/include/frequent_items_sketch.hpp
51
- ${CMAKE_CURRENT_SOURCE_DIR}/include/frequent_items_sketch_impl.hpp
52
- ${CMAKE_CURRENT_SOURCE_DIR}/include/reverse_purge_hash_map.hpp
53
- ${CMAKE_CURRENT_SOURCE_DIR}/include/reverse_purge_hash_map_impl.hpp
54
- )