datasketches 0.2.0 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/README.md +7 -7
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/theta_wrapper.cpp +20 -4
  8. data/lib/datasketches/version.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
  10. data/vendor/datasketches-cpp/LICENSE +40 -3
  11. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  12. data/vendor/datasketches-cpp/NOTICE +1 -1
  13. data/vendor/datasketches-cpp/README.md +76 -9
  14. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  17. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  18. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  19. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  20. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  21. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  22. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  23. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  24. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
  25. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  26. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  27. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
  28. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
  29. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
  31. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  32. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  33. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  34. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  35. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  36. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  37. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  38. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  39. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  40. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
  41. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  42. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  43. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  44. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  45. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  46. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  47. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  48. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  49. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  50. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  51. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  52. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  53. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  54. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  55. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  56. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  57. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  58. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  59. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  60. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  61. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  62. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
  63. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  64. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  65. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  66. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  67. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  68. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  69. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  70. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  71. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  72. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  73. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  74. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  75. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  76. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  77. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  78. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  79. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  80. data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
  81. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  82. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  83. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  84. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  85. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
  86. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
  87. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  88. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  89. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  90. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
  91. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  92. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  93. data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
  94. data/vendor/datasketches-cpp/python/README.md +50 -50
  95. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  96. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  97. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  98. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
  99. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
  100. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  101. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  102. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  103. data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
  104. data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
  105. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  106. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  107. data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
  108. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  109. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  110. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  111. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  112. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
  113. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  114. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
  117. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
  118. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  119. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  120. data/vendor/datasketches-cpp/setup.py +10 -7
  121. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  122. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  124. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
  125. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
  126. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  127. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  128. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
  129. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  130. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  131. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
  132. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
  133. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
  134. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
  135. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
  137. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
  139. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
  140. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  141. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  142. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  144. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  145. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  146. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  147. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  148. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  149. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
  150. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
  151. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
  152. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  153. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  154. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
  155. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  157. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
  158. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
  159. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  160. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  161. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
  162. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  163. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  164. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  165. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  166. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
  167. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
  168. metadata +18 -7
  169. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  170. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -23,8 +23,8 @@
23
23
  namespace datasketches {
24
24
 
25
25
  template<typename A>
26
- theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator):
27
- state_(lg_cur_size, lg_nom_size, rf, theta, seed, pass_through_policy(), allocator)
26
+ theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const A& allocator):
27
+ state_(lg_cur_size, lg_nom_size, rf, p, theta, seed, nop_policy(), allocator)
28
28
  {}
29
29
 
30
30
  template<typename A>
@@ -38,14 +38,17 @@ auto theta_union_alloc<A>::get_result(bool ordered) const -> CompactSketch {
38
38
  return state_.get_result(ordered);
39
39
  }
40
40
 
41
+ template<typename A>
42
+ void theta_union_alloc<A>::reset() {
43
+ state_.reset();
44
+ }
45
+
41
46
  template<typename A>
42
47
  theta_union_alloc<A>::builder::builder(const A& allocator): theta_base_builder<builder, A>(allocator) {}
43
48
 
44
49
  template<typename A>
45
50
  auto theta_union_alloc<A>::builder::build() const -> theta_union_alloc {
46
- return theta_union_alloc(
47
- this->starting_sub_multiple(this->lg_k_ + 1, this->MIN_LG_K, static_cast<uint8_t>(this->rf_)),
48
- this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
51
+ return theta_union_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->allocator_);
49
52
  }
50
53
 
51
54
  } /* namespace datasketches */
@@ -40,8 +40,8 @@ struct theta_update_sketch_base {
40
40
  using resize_factor = theta_constants::resize_factor;
41
41
  using comparator = compare_by_key<ExtractKey>;
42
42
 
43
- theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta,
44
- uint64_t seed, const Allocator& allocator, bool is_empty = true);
43
+ theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p,
44
+ uint64_t theta, uint64_t seed, const Allocator& allocator, bool is_empty = true);
45
45
  theta_update_sketch_base(const theta_update_sketch_base& other);
46
46
  theta_update_sketch_base(theta_update_sketch_base&& other) noexcept;
47
47
  ~theta_update_sketch_base();
@@ -53,6 +53,8 @@ struct theta_update_sketch_base {
53
53
  inline uint64_t hash_and_screen(const void* data, size_t length);
54
54
 
55
55
  inline std::pair<iterator, bool> find(uint64_t key) const;
56
+ static inline std::pair<iterator, bool> find(Entry* entries, uint8_t lg_size, uint64_t key);
57
+
56
58
 
57
59
  template<typename FwdEntry>
58
60
  inline void insert(iterator it, FwdEntry&& entry);
@@ -73,6 +75,7 @@ struct theta_update_sketch_base {
73
75
  uint8_t lg_cur_size_;
74
76
  uint8_t lg_nom_size_;
75
77
  resize_factor rf_;
78
+ float p_;
76
79
  uint32_t num_entries_;
77
80
  uint64_t theta_;
78
81
  uint64_t seed_;
@@ -81,6 +84,7 @@ struct theta_update_sketch_base {
81
84
  void resize();
82
85
  void rebuild();
83
86
  void trim();
87
+ void reset();
84
88
 
85
89
  static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
86
90
  static inline uint32_t get_stride(uint64_t key, uint8_t lg_size);
@@ -92,11 +96,14 @@ struct theta_update_sketch_base {
92
96
  template<typename Derived, typename Allocator>
93
97
  class theta_base_builder {
94
98
  public:
99
+ // TODO: Redundant and deprecated. Will be removed in next major version release.
95
100
  using resize_factor = theta_constants::resize_factor;
96
101
  static const uint8_t MIN_LG_K = theta_constants::MIN_LG_K;
97
102
  static const uint8_t MAX_LG_K = theta_constants::MAX_LG_K;
98
- static const uint8_t DEFAULT_LG_K = 12;
99
- static const resize_factor DEFAULT_RESIZE_FACTOR = resize_factor::X8;
103
+ // TODO: The following defaults are redundant and deprecated. Will be removed in the
104
+ // next major version release
105
+ static const uint8_t DEFAULT_LG_K = theta_constants::DEFAULT_LG_K;
106
+ static const resize_factor DEFAULT_RESIZE_FACTOR = theta_constants::DEFAULT_RESIZE_FACTOR;
100
107
 
101
108
  /**
102
109
  * Creates and instance of the builder with default parameters.
@@ -144,7 +151,6 @@ protected:
144
151
 
145
152
  uint64_t starting_theta() const;
146
153
  uint8_t starting_lg_size() const;
147
- static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
148
154
  };
149
155
 
150
156
  // key extractor
@@ -24,22 +24,25 @@
24
24
  #include <sstream>
25
25
  #include <algorithm>
26
26
 
27
+ #include "theta_helpers.hpp"
28
+
27
29
  namespace datasketches {
28
30
 
29
31
  template<typename EN, typename EK, typename A>
30
- theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator, bool is_empty):
32
+ theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const A& allocator, bool is_empty):
31
33
  allocator_(allocator),
32
34
  is_empty_(is_empty),
33
35
  lg_cur_size_(lg_cur_size),
34
36
  lg_nom_size_(lg_nom_size),
35
37
  rf_(rf),
38
+ p_(p),
36
39
  num_entries_(0),
37
40
  theta_(theta),
38
41
  seed_(seed),
39
42
  entries_(nullptr)
40
43
  {
41
44
  if (lg_cur_size > 0) {
42
- const size_t size = 1 << lg_cur_size;
45
+ const size_t size = 1ULL << lg_cur_size;
43
46
  entries_ = allocator_.allocate(size);
44
47
  for (size_t i = 0; i < size; ++i) EK()(entries_[i]) = 0;
45
48
  }
@@ -52,13 +55,14 @@ is_empty_(other.is_empty_),
52
55
  lg_cur_size_(other.lg_cur_size_),
53
56
  lg_nom_size_(other.lg_nom_size_),
54
57
  rf_(other.rf_),
58
+ p_(other.p_),
55
59
  num_entries_(other.num_entries_),
56
60
  theta_(other.theta_),
57
61
  seed_(other.seed_),
58
62
  entries_(nullptr)
59
63
  {
60
64
  if (other.entries_ != nullptr) {
61
- const size_t size = 1 << lg_cur_size_;
65
+ const size_t size = 1ULL << lg_cur_size_;
62
66
  entries_ = allocator_.allocate(size);
63
67
  for (size_t i = 0; i < size; ++i) {
64
68
  if (EK()(other.entries_[i]) != 0) {
@@ -77,6 +81,7 @@ is_empty_(other.is_empty_),
77
81
  lg_cur_size_(other.lg_cur_size_),
78
82
  lg_nom_size_(other.lg_nom_size_),
79
83
  rf_(other.rf_),
84
+ p_(other.p_),
80
85
  num_entries_(other.num_entries_),
81
86
  theta_(other.theta_),
82
87
  seed_(other.seed_),
@@ -89,7 +94,7 @@ template<typename EN, typename EK, typename A>
89
94
  theta_update_sketch_base<EN, EK, A>::~theta_update_sketch_base()
90
95
  {
91
96
  if (entries_ != nullptr) {
92
- const size_t size = 1 << lg_cur_size_;
97
+ const size_t size = 1ULL << lg_cur_size_;
93
98
  for (size_t i = 0; i < size; ++i) {
94
99
  if (EK()(entries_[i]) != 0) entries_[i].~EN();
95
100
  }
@@ -105,6 +110,7 @@ theta_update_sketch_base<EN, EK, A>& theta_update_sketch_base<EN, EK, A>::operat
105
110
  std::swap(lg_cur_size_, copy.lg_cur_size_);
106
111
  std::swap(lg_nom_size_, copy.lg_nom_size_);
107
112
  std::swap(rf_, copy.rf_);
113
+ std::swap(p_, copy.p_);
108
114
  std::swap(num_entries_, copy.num_entries_);
109
115
  std::swap(theta_, copy.theta_);
110
116
  std::swap(seed_, copy.seed_);
@@ -119,6 +125,7 @@ theta_update_sketch_base<EN, EK, A>& theta_update_sketch_base<EN, EK, A>::operat
119
125
  std::swap(lg_cur_size_, other.lg_cur_size_);
120
126
  std::swap(lg_nom_size_, other.lg_nom_size_);
121
127
  std::swap(rf_, other.rf_);
128
+ std::swap(p_, other.p_);
122
129
  std::swap(num_entries_, other.num_entries_);
123
130
  std::swap(theta_, other.theta_);
124
131
  std::swap(seed_, other.seed_);
@@ -136,18 +143,23 @@ uint64_t theta_update_sketch_base<EN, EK, A>::hash_and_screen(const void* data,
136
143
 
137
144
  template<typename EN, typename EK, typename A>
138
145
  auto theta_update_sketch_base<EN, EK, A>::find(uint64_t key) const -> std::pair<iterator, bool> {
139
- const size_t size = 1 << lg_cur_size_;
140
- const size_t mask = size - 1;
141
- const uint32_t stride = get_stride(key, lg_cur_size_);
146
+ return find(entries_, lg_cur_size_, key);
147
+ }
148
+
149
+ template<typename EN, typename EK, typename A>
150
+ auto theta_update_sketch_base<EN, EK, A>::find(EN* entries, uint8_t lg_size, uint64_t key) -> std::pair<iterator, bool> {
151
+ const uint32_t size = 1 << lg_size;
152
+ const uint32_t mask = size - 1;
153
+ const uint32_t stride = get_stride(key, lg_size);
142
154
  uint32_t index = static_cast<uint32_t>(key) & mask;
143
155
  // search for duplicate or zero
144
156
  const uint32_t loop_index = index;
145
157
  do {
146
- const uint64_t probe = EK()(entries_[index]);
158
+ const uint64_t probe = EK()(entries[index]);
147
159
  if (probe == 0) {
148
- return std::pair<iterator, bool>(&entries_[index], false);
160
+ return std::pair<iterator, bool>(&entries[index], false);
149
161
  } else if (probe == key) {
150
- return std::pair<iterator, bool>(&entries_[index], true);
162
+ return std::pair<iterator, bool>(&entries[index], true);
151
163
  }
152
164
  index = (index + stride) & mask;
153
165
  } while (index != loop_index);
@@ -175,13 +187,13 @@ auto theta_update_sketch_base<EN, EK, A>::begin() const -> iterator {
175
187
 
176
188
  template<typename EN, typename EK, typename A>
177
189
  auto theta_update_sketch_base<EN, EK, A>::end() const -> iterator {
178
- return &entries_[1 << lg_cur_size_];
190
+ return &entries_[1ULL << lg_cur_size_];
179
191
  }
180
192
 
181
193
  template<typename EN, typename EK, typename A>
182
194
  uint32_t theta_update_sketch_base<EN, EK, A>::get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size) {
183
195
  const double fraction = (lg_cur_size <= lg_nom_size) ? RESIZE_THRESHOLD : REBUILD_THRESHOLD;
184
- return std::floor(fraction * (1 << lg_cur_size));
196
+ return static_cast<uint32_t>(std::floor(fraction * (1 << lg_cur_size)));
185
197
  }
186
198
 
187
199
  template<typename EN, typename EK, typename A>
@@ -192,29 +204,29 @@ uint32_t theta_update_sketch_base<EN, EK, A>::get_stride(uint64_t key, uint8_t l
192
204
 
193
205
  template<typename EN, typename EK, typename A>
194
206
  void theta_update_sketch_base<EN, EK, A>::resize() {
195
- const size_t old_size = 1 << lg_cur_size_;
196
- const uint8_t lg_tgt_size = lg_nom_size_ + 1;
197
- const uint8_t factor = std::max(1, std::min(static_cast<int>(rf_), lg_tgt_size - lg_cur_size_));
198
- lg_cur_size_ += factor;
199
- const size_t new_size = 1 << lg_cur_size_;
200
- EN* old_entries = entries_;
201
- entries_ = allocator_.allocate(new_size);
202
- for (size_t i = 0; i < new_size; ++i) EK()(entries_[i]) = 0;
203
- num_entries_ = 0;
207
+ const size_t old_size = 1ULL << lg_cur_size_;
208
+ const uint8_t lg_new_size = std::min<uint8_t>(lg_cur_size_ + static_cast<uint8_t>(rf_), lg_nom_size_ + 1);
209
+ const size_t new_size = 1ULL << lg_new_size;
210
+ EN* new_entries = allocator_.allocate(new_size);
211
+ for (size_t i = 0; i < new_size; ++i) EK()(new_entries[i]) = 0;
204
212
  for (size_t i = 0; i < old_size; ++i) {
205
- const uint64_t key = EK()(old_entries[i]);
213
+ const uint64_t key = EK()(entries_[i]);
206
214
  if (key != 0) {
207
- insert(find(key).first, std::move(old_entries[i])); // consider a special insert with no comparison
208
- old_entries[i].~EN();
215
+ // always finds an empty slot in a larger table
216
+ new (find(new_entries, lg_new_size, key).first) EN(std::move(entries_[i]));
217
+ entries_[i].~EN();
218
+ EK()(entries_[i]) = 0;
209
219
  }
210
220
  }
211
- allocator_.deallocate(old_entries, old_size);
221
+ std::swap(entries_, new_entries);
222
+ lg_cur_size_ = lg_new_size;
223
+ allocator_.deallocate(new_entries, old_size);
212
224
  }
213
225
 
214
226
  // assumes number of entries > nominal size
215
227
  template<typename EN, typename EK, typename A>
216
228
  void theta_update_sketch_base<EN, EK, A>::rebuild() {
217
- const size_t size = 1 << lg_cur_size_;
229
+ const size_t size = 1ULL << lg_cur_size_;
218
230
  const uint32_t nominal_size = 1 << lg_nom_size_;
219
231
 
220
232
  // empty entries have uninitialized payloads
@@ -227,10 +239,10 @@ void theta_update_sketch_base<EN, EK, A>::rebuild() {
227
239
  const size_t num_old_entries = num_entries_;
228
240
  entries_ = allocator_.allocate(size);
229
241
  for (size_t i = 0; i < size; ++i) EK()(entries_[i]) = 0;
230
- num_entries_ = 0;
242
+ num_entries_ = nominal_size;
231
243
  // relies on consolidating non-empty entries to the front
232
244
  for (size_t i = 0; i < nominal_size; ++i) {
233
- insert(find(EK()(old_entries[i])).first, std::move(old_entries[i])); // consider a special insert with no comparison
245
+ new (find(EK()(old_entries[i])).first) EN(std::move(old_entries[i]));
234
246
  old_entries[i].~EN();
235
247
  }
236
248
  for (size_t i = nominal_size; i < num_old_entries; ++i) old_entries[i].~EN();
@@ -242,6 +254,29 @@ void theta_update_sketch_base<EN, EK, A>::trim() {
242
254
  if (num_entries_ > static_cast<uint32_t>(1 << lg_nom_size_)) rebuild();
243
255
  }
244
256
 
257
+ template<typename EN, typename EK, typename A>
258
+ void theta_update_sketch_base<EN, EK, A>::reset() {
259
+ const size_t cur_size = 1ULL << lg_cur_size_;
260
+ for (size_t i = 0; i < cur_size; ++i) {
261
+ if (EK()(entries_[i]) != 0) {
262
+ entries_[i].~EN();
263
+ EK()(entries_[i]) = 0;
264
+ }
265
+ }
266
+ const uint8_t starting_lg_size = theta_build_helper<true>::starting_sub_multiple(
267
+ lg_nom_size_ + 1, theta_constants::MIN_LG_K, static_cast<uint8_t>(rf_));
268
+ if (starting_lg_size != lg_cur_size_) {
269
+ allocator_.deallocate(entries_, cur_size);
270
+ lg_cur_size_ = starting_lg_size;
271
+ const size_t new_size = 1ULL << starting_lg_size;
272
+ entries_ = allocator_.allocate(new_size);
273
+ for (size_t i = 0; i < new_size; ++i) EK()(entries_[i]) = 0;
274
+ }
275
+ num_entries_ = 0;
276
+ theta_ = theta_build_helper<true>::starting_theta_from_p(p_);
277
+ is_empty_ = true;
278
+ }
279
+
245
280
  template<typename EN, typename EK, typename A>
246
281
  void theta_update_sketch_base<EN, EK, A>::consolidate_non_empty(EN* entries, size_t size, size_t num) {
247
282
  // find the first empty slot
@@ -266,7 +301,11 @@ void theta_update_sketch_base<EN, EK, A>::consolidate_non_empty(EN* entries, siz
266
301
 
267
302
  template<typename Derived, typename Allocator>
268
303
  theta_base_builder<Derived, Allocator>::theta_base_builder(const Allocator& allocator):
269
- allocator_(allocator), lg_k_(DEFAULT_LG_K), rf_(DEFAULT_RESIZE_FACTOR), p_(1), seed_(DEFAULT_SEED) {}
304
+ allocator_(allocator),
305
+ lg_k_(theta_constants::DEFAULT_LG_K),
306
+ rf_(theta_constants::DEFAULT_RESIZE_FACTOR),
307
+ p_(1),
308
+ seed_(DEFAULT_SEED) {}
270
309
 
271
310
  template<typename Derived, typename Allocator>
272
311
  Derived& theta_base_builder<Derived, Allocator>::set_lg_k(uint8_t lg_k) {
@@ -301,18 +340,12 @@ Derived& theta_base_builder<Derived, Allocator>::set_seed(uint64_t seed) {
301
340
 
302
341
  template<typename Derived, typename Allocator>
303
342
  uint64_t theta_base_builder<Derived, Allocator>::starting_theta() const {
304
- if (p_ < 1) return theta_constants::MAX_THETA * p_;
305
- return theta_constants::MAX_THETA;
343
+ return theta_build_helper<true>::starting_theta_from_p(p_);
306
344
  }
307
345
 
308
346
  template<typename Derived, typename Allocator>
309
347
  uint8_t theta_base_builder<Derived, Allocator>::starting_lg_size() const {
310
- return starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_));
311
- }
312
-
313
- template<typename Derived, typename Allocator>
314
- uint8_t theta_base_builder<Derived, Allocator>::starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
315
- return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
348
+ return theta_build_helper<true>::starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_));
316
349
  }
317
350
 
318
351
  // iterator
@@ -43,4 +43,5 @@ target_sources(theta_test
43
43
  theta_intersection_test.cpp
44
44
  theta_a_not_b_test.cpp
45
45
  theta_jaccard_similarity_test.cpp
46
+ theta_setop_test.cpp
46
47
  )
@@ -37,7 +37,7 @@ TEST_CASE("theta a-not-b: empty", "[theta_a_not_b]") {
37
37
  TEST_CASE("theta a-not-b: non empty no retained keys", "[theta_a_not_b]") {
38
38
  update_theta_sketch a = update_theta_sketch::builder().build();
39
39
  a.update(1);
40
- update_theta_sketch b = update_theta_sketch::builder().set_p(0.001).build();
40
+ update_theta_sketch b = update_theta_sketch::builder().set_p(0.001f).build();
41
41
  theta_a_not_b a_not_b;
42
42
 
43
43
  // B is still empty
@@ -167,6 +167,28 @@ TEST_CASE("theta a-not-b: estimation mode half overlap", "[theta_a_not_b]") {
167
167
  REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
168
168
  }
169
169
 
170
+ TEST_CASE("theta a-not-b: estimation mode half overlap wrapped compact", "[theta_a_not_b]") {
171
+ update_theta_sketch a = update_theta_sketch::builder().build();
172
+ int value = 0;
173
+ for (int i = 0; i < 10000; i++) a.update(value++);
174
+ auto bytes_a = a.compact().serialize();
175
+
176
+ update_theta_sketch b = update_theta_sketch::builder().build();
177
+ value = 5000;
178
+ for (int i = 0; i < 10000; i++) b.update(value++);
179
+ auto bytes_b = b.compact().serialize();
180
+
181
+ theta_a_not_b a_not_b;
182
+
183
+ auto result = a_not_b.compute(
184
+ wrapped_compact_theta_sketch::wrap(bytes_a.data(), bytes_a.size()),
185
+ wrapped_compact_theta_sketch::wrap(bytes_b.data(), bytes_b.size())
186
+ );
187
+ REQUIRE_FALSE(result.is_empty());
188
+ REQUIRE(result.is_estimation_mode());
189
+ REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
190
+ }
191
+
170
192
  TEST_CASE("theta a-not-b: estimation mode disjoint", "[theta_a_not_b]") {
171
193
  update_theta_sketch a = update_theta_sketch::builder().build();
172
194
  int value = 0;
@@ -48,7 +48,7 @@ TEST_CASE("theta intersection: empty", "[theta_intersection]") {
48
48
  }
49
49
 
50
50
  TEST_CASE("theta intersection: non empty no retained keys", "[theta_intersection]") {
51
- update_theta_sketch sketch = update_theta_sketch::builder().set_p(0.001).build();
51
+ update_theta_sketch sketch = update_theta_sketch::builder().set_p(0.001f).build();
52
52
  sketch.update(1);
53
53
  theta_intersection intersection;
54
54
  intersection.update(sketch);
@@ -174,6 +174,26 @@ TEST_CASE("theta intersection: estimation mode half overlap ordered", "[theta_in
174
174
  REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
175
175
  }
176
176
 
177
+ TEST_CASE("theta intersection: estimation mode half overlap ordered wrapped compact", "[theta_intersection]") {
178
+ update_theta_sketch sketch1 = update_theta_sketch::builder().build();
179
+ int value = 0;
180
+ for (int i = 0; i < 10000; i++) sketch1.update(value++);
181
+ auto bytes1 = sketch1.compact().serialize();
182
+
183
+ update_theta_sketch sketch2 = update_theta_sketch::builder().build();
184
+ value = 5000;
185
+ for (int i = 0; i < 10000; i++) sketch2.update(value++);
186
+ auto bytes2 = sketch2.compact().serialize();
187
+
188
+ theta_intersection intersection;
189
+ intersection.update(wrapped_compact_theta_sketch::wrap(bytes1.data(), bytes1.size()));
190
+ intersection.update(wrapped_compact_theta_sketch::wrap(bytes2.data(), bytes2.size()));
191
+ compact_theta_sketch result = intersection.get_result();
192
+ REQUIRE_FALSE(result.is_empty());
193
+ REQUIRE(result.is_estimation_mode());
194
+ REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
195
+ }
196
+
177
197
  TEST_CASE("theta intersection: estimation mode disjoint unordered", "[theta_intersection]") {
178
198
  update_theta_sketch sketch1 = update_theta_sketch::builder().build();
179
199
  int value = 0;
@@ -100,6 +100,28 @@ TEST_CASE("theta jaccard: half overlap estimation mode", "[theta_sketch]") {
100
100
  REQUIRE(jc[2] == Approx(0.33).margin(0.01));
101
101
  }
102
102
 
103
+ TEST_CASE("theta jaccard: half overlap estimation mode custom seed", "[theta_sketch]") {
104
+ const uint64_t seed = 123;
105
+ auto sk_a = update_theta_sketch::builder().set_seed(seed).build();
106
+ auto sk_b = update_theta_sketch::builder().set_seed(seed).build();
107
+ for (int i = 0; i < 10000; ++i) {
108
+ sk_a.update(i);
109
+ sk_b.update(i + 5000);
110
+ }
111
+
112
+ // update sketches
113
+ auto jc = theta_jaccard_similarity::jaccard(sk_a, sk_b, seed);
114
+ REQUIRE(jc[0] == Approx(0.33).margin(0.01));
115
+ REQUIRE(jc[1] == Approx(0.33).margin(0.01));
116
+ REQUIRE(jc[2] == Approx(0.33).margin(0.01));
117
+
118
+ // compact sketches
119
+ jc = theta_jaccard_similarity::jaccard(sk_a.compact(), sk_b.compact(), seed);
120
+ REQUIRE(jc[0] == Approx(0.33).margin(0.01));
121
+ REQUIRE(jc[1] == Approx(0.33).margin(0.01));
122
+ REQUIRE(jc[2] == Approx(0.33).margin(0.01));
123
+ }
124
+
103
125
  /**
104
126
  * The distribution is quite tight, about +/- 0.7%, which is pretty good since the accuracy of the
105
127
  * underlying sketch is about +/- 1.56%.
@@ -107,7 +129,7 @@ TEST_CASE("theta jaccard: half overlap estimation mode", "[theta_sketch]") {
107
129
  TEST_CASE("theta jaccard: similarity test", "[theta_sketch]") {
108
130
  const int8_t min_lg_k = 12;
109
131
  const int u1 = 1 << 20;
110
- const int u2 = u1 * 0.95;
132
+ const int u2 = static_cast<int>(u1 * 0.95);
111
133
  const double threshold = 0.943;
112
134
 
113
135
  auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).build();
@@ -120,6 +142,23 @@ TEST_CASE("theta jaccard: similarity test", "[theta_sketch]") {
120
142
  REQUIRE(theta_jaccard_similarity::similarity_test(actual, actual, threshold));
121
143
  }
122
144
 
145
+ TEST_CASE("theta jaccard: similarity test custom seed", "[theta_sketch]") {
146
+ const int8_t min_lg_k = 12;
147
+ const int u1 = 1 << 20;
148
+ const int u2 = static_cast<int>(u1 * 0.95);
149
+ const double threshold = 0.943;
150
+ const uint64_t seed = 1234;
151
+
152
+ auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
153
+ for (int i = 0; i < u1; ++i) expected.update(i);
154
+
155
+ auto actual = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
156
+ for (int i = 0; i < u2; ++i) actual.update(i);
157
+
158
+ REQUIRE(theta_jaccard_similarity::similarity_test(actual, expected, threshold, seed));
159
+ REQUIRE(theta_jaccard_similarity::similarity_test(actual, actual, threshold, seed));
160
+ }
161
+
123
162
  /**
124
163
  * The distribution is much looser here, about +/- 14%. This is due to the fact that intersections lose accuracy
125
164
  * as the ratio of intersection to the union becomes a small number.
@@ -127,7 +166,7 @@ TEST_CASE("theta jaccard: similarity test", "[theta_sketch]") {
127
166
  TEST_CASE("theta jaccard: dissimilarity test", "[theta_sketch]") {
128
167
  const int8_t min_lg_k = 12;
129
168
  const int u1 = 1 << 20;
130
- const int u2 = u1 * 0.05;
169
+ const int u2 = static_cast<int>(u1 * 0.05);
131
170
  const double threshold = 0.061;
132
171
 
133
172
  auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).build();
@@ -140,4 +179,21 @@ TEST_CASE("theta jaccard: dissimilarity test", "[theta_sketch]") {
140
179
  REQUIRE_FALSE(theta_jaccard_similarity::dissimilarity_test(actual, actual, threshold));
141
180
  }
142
181
 
182
+ TEST_CASE("theta jaccard: dissimilarity test custom seed", "[theta_sketch]") {
183
+ const int8_t min_lg_k = 12;
184
+ const int u1 = 1 << 20;
185
+ const int u2 = static_cast<int>(u1 * 0.05);
186
+ const double threshold = 0.061;
187
+ const uint64_t seed = 1234;
188
+
189
+ auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
190
+ for (int i = 0; i < u1; ++i) expected.update(i);
191
+
192
+ auto actual = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
193
+ for (int i = 0; i < u2; ++i) actual.update(i);
194
+
195
+ REQUIRE(theta_jaccard_similarity::dissimilarity_test(actual, expected, threshold, seed));
196
+ REQUIRE_FALSE(theta_jaccard_similarity::dissimilarity_test(actual, actual, threshold, seed));
197
+ }
198
+
143
199
  } /* namespace datasketches */