datasketches 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/datasketches/version.rb +1 -1
  4. data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
  5. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  6. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  7. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
  9. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  10. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  11. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  12. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  13. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  14. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
  15. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
  16. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
  17. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  18. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  19. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  20. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  21. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  22. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  23. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  24. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
  25. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  26. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  28. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  29. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  30. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  31. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  32. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  33. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  34. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  35. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  36. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  37. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  38. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  39. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  40. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  41. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  42. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  43. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  44. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  45. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
  46. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  47. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  48. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  49. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  50. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  51. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  52. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  53. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  54. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  55. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  56. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  57. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  58. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  59. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  60. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  61. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  62. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  63. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  64. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  65. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  66. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  67. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
  69. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  70. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  71. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
  73. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  74. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  75. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
  76. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  78. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  79. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  80. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
  84. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  85. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
  86. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
  87. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  88. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  89. data/vendor/datasketches-cpp/setup.py +1 -1
  90. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  91. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  92. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  93. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  94. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  95. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  96. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
  97. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
  98. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
  99. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  100. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
  101. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
  103. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  104. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  105. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  106. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
  107. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  108. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  109. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
  110. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  111. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  112. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  113. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  114. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  115. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  116. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
  117. metadata +8 -3
@@ -35,13 +35,13 @@ public:
35
35
  using CompactSketch = compact_theta_sketch_alloc<Allocator>;
36
36
  using resize_factor = theta_constants::resize_factor;
37
37
 
38
- struct pass_through_policy {
39
- uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
38
+ struct nop_policy {
39
+ void operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
40
+ unused(internal_entry);
40
41
  unused(incoming_entry);
41
- return internal_entry;
42
42
  }
43
43
  };
44
- using State = theta_union_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
44
+ using State = theta_union_base<Entry, ExtractKey, nop_policy, Sketch, CompactSketch, Allocator>;
45
45
 
46
46
  // No constructor here. Use builder instead.
47
47
  class builder;
@@ -43,7 +43,7 @@ void theta_union_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
43
43
  if (sketch.get_theta64() < union_theta_) union_theta_ = sketch.get_theta64();
44
44
  for (auto& entry: sketch) {
45
45
  const uint64_t hash = EK()(entry);
46
- if (hash < union_theta_) {
46
+ if (hash < union_theta_ && hash < table_.theta_) {
47
47
  auto result = table_.find(hash);
48
48
  if (!result.second) {
49
49
  table_.insert(result.first, conditional_forward<SS>(entry));
@@ -24,7 +24,7 @@ namespace datasketches {
24
24
 
25
25
  template<typename A>
26
26
  theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator):
27
- state_(lg_cur_size, lg_nom_size, rf, theta, seed, pass_through_policy(), allocator)
27
+ state_(lg_cur_size, lg_nom_size, rf, theta, seed, nop_policy(), allocator)
28
28
  {}
29
29
 
30
30
  template<typename A>
@@ -53,6 +53,8 @@ struct theta_update_sketch_base {
53
53
  inline uint64_t hash_and_screen(const void* data, size_t length);
54
54
 
55
55
  inline std::pair<iterator, bool> find(uint64_t key) const;
56
+ static inline std::pair<iterator, bool> find(Entry* entries, uint8_t lg_size, uint64_t key);
57
+
56
58
 
57
59
  template<typename FwdEntry>
58
60
  inline void insert(iterator it, FwdEntry&& entry);
@@ -39,7 +39,7 @@ seed_(seed),
39
39
  entries_(nullptr)
40
40
  {
41
41
  if (lg_cur_size > 0) {
42
- const size_t size = 1 << lg_cur_size;
42
+ const size_t size = 1ULL << lg_cur_size;
43
43
  entries_ = allocator_.allocate(size);
44
44
  for (size_t i = 0; i < size; ++i) EK()(entries_[i]) = 0;
45
45
  }
@@ -58,7 +58,7 @@ seed_(other.seed_),
58
58
  entries_(nullptr)
59
59
  {
60
60
  if (other.entries_ != nullptr) {
61
- const size_t size = 1 << lg_cur_size_;
61
+ const size_t size = 1ULL << lg_cur_size_;
62
62
  entries_ = allocator_.allocate(size);
63
63
  for (size_t i = 0; i < size; ++i) {
64
64
  if (EK()(other.entries_[i]) != 0) {
@@ -89,7 +89,7 @@ template<typename EN, typename EK, typename A>
89
89
  theta_update_sketch_base<EN, EK, A>::~theta_update_sketch_base()
90
90
  {
91
91
  if (entries_ != nullptr) {
92
- const size_t size = 1 << lg_cur_size_;
92
+ const size_t size = 1ULL << lg_cur_size_;
93
93
  for (size_t i = 0; i < size; ++i) {
94
94
  if (EK()(entries_[i]) != 0) entries_[i].~EN();
95
95
  }
@@ -136,18 +136,23 @@ uint64_t theta_update_sketch_base<EN, EK, A>::hash_and_screen(const void* data,
136
136
 
137
137
  template<typename EN, typename EK, typename A>
138
138
  auto theta_update_sketch_base<EN, EK, A>::find(uint64_t key) const -> std::pair<iterator, bool> {
139
- const size_t size = 1 << lg_cur_size_;
140
- const size_t mask = size - 1;
141
- const uint32_t stride = get_stride(key, lg_cur_size_);
139
+ return find(entries_, lg_cur_size_, key);
140
+ }
141
+
142
+ template<typename EN, typename EK, typename A>
143
+ auto theta_update_sketch_base<EN, EK, A>::find(EN* entries, uint8_t lg_size, uint64_t key) -> std::pair<iterator, bool> {
144
+ const uint32_t size = 1 << lg_size;
145
+ const uint32_t mask = size - 1;
146
+ const uint32_t stride = get_stride(key, lg_size);
142
147
  uint32_t index = static_cast<uint32_t>(key) & mask;
143
148
  // search for duplicate or zero
144
149
  const uint32_t loop_index = index;
145
150
  do {
146
- const uint64_t probe = EK()(entries_[index]);
151
+ const uint64_t probe = EK()(entries[index]);
147
152
  if (probe == 0) {
148
- return std::pair<iterator, bool>(&entries_[index], false);
153
+ return std::pair<iterator, bool>(&entries[index], false);
149
154
  } else if (probe == key) {
150
- return std::pair<iterator, bool>(&entries_[index], true);
155
+ return std::pair<iterator, bool>(&entries[index], true);
151
156
  }
152
157
  index = (index + stride) & mask;
153
158
  } while (index != loop_index);
@@ -175,13 +180,13 @@ auto theta_update_sketch_base<EN, EK, A>::begin() const -> iterator {
175
180
 
176
181
  template<typename EN, typename EK, typename A>
177
182
  auto theta_update_sketch_base<EN, EK, A>::end() const -> iterator {
178
- return &entries_[1 << lg_cur_size_];
183
+ return &entries_[1ULL << lg_cur_size_];
179
184
  }
180
185
 
181
186
  template<typename EN, typename EK, typename A>
182
187
  uint32_t theta_update_sketch_base<EN, EK, A>::get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size) {
183
188
  const double fraction = (lg_cur_size <= lg_nom_size) ? RESIZE_THRESHOLD : REBUILD_THRESHOLD;
184
- return std::floor(fraction * (1 << lg_cur_size));
189
+ return static_cast<uint32_t>(std::floor(fraction * (1 << lg_cur_size)));
185
190
  }
186
191
 
187
192
  template<typename EN, typename EK, typename A>
@@ -192,29 +197,29 @@ uint32_t theta_update_sketch_base<EN, EK, A>::get_stride(uint64_t key, uint8_t l
192
197
 
193
198
  template<typename EN, typename EK, typename A>
194
199
  void theta_update_sketch_base<EN, EK, A>::resize() {
195
- const size_t old_size = 1 << lg_cur_size_;
196
- const uint8_t lg_tgt_size = lg_nom_size_ + 1;
197
- const uint8_t factor = std::max(1, std::min(static_cast<int>(rf_), lg_tgt_size - lg_cur_size_));
198
- lg_cur_size_ += factor;
199
- const size_t new_size = 1 << lg_cur_size_;
200
- EN* old_entries = entries_;
201
- entries_ = allocator_.allocate(new_size);
202
- for (size_t i = 0; i < new_size; ++i) EK()(entries_[i]) = 0;
203
- num_entries_ = 0;
200
+ const size_t old_size = 1ULL << lg_cur_size_;
201
+ const uint8_t lg_new_size = std::min<uint8_t>(lg_cur_size_ + static_cast<uint8_t>(rf_), lg_nom_size_ + 1);
202
+ const size_t new_size = 1ULL << lg_new_size;
203
+ EN* new_entries = allocator_.allocate(new_size);
204
+ for (size_t i = 0; i < new_size; ++i) EK()(new_entries[i]) = 0;
204
205
  for (size_t i = 0; i < old_size; ++i) {
205
- const uint64_t key = EK()(old_entries[i]);
206
+ const uint64_t key = EK()(entries_[i]);
206
207
  if (key != 0) {
207
- insert(find(key).first, std::move(old_entries[i])); // consider a special insert with no comparison
208
- old_entries[i].~EN();
208
+ // always finds an empty slot in a larger table
209
+ new (find(new_entries, lg_new_size, key).first) EN(std::move(entries_[i]));
210
+ entries_[i].~EN();
211
+ EK()(entries_[i]) = 0;
209
212
  }
210
213
  }
211
- allocator_.deallocate(old_entries, old_size);
214
+ std::swap(entries_, new_entries);
215
+ lg_cur_size_ = lg_new_size;
216
+ allocator_.deallocate(new_entries, old_size);
212
217
  }
213
218
 
214
219
  // assumes number of entries > nominal size
215
220
  template<typename EN, typename EK, typename A>
216
221
  void theta_update_sketch_base<EN, EK, A>::rebuild() {
217
- const size_t size = 1 << lg_cur_size_;
222
+ const size_t size = 1ULL << lg_cur_size_;
218
223
  const uint32_t nominal_size = 1 << lg_nom_size_;
219
224
 
220
225
  // empty entries have uninitialized payloads
@@ -227,10 +232,10 @@ void theta_update_sketch_base<EN, EK, A>::rebuild() {
227
232
  const size_t num_old_entries = num_entries_;
228
233
  entries_ = allocator_.allocate(size);
229
234
  for (size_t i = 0; i < size; ++i) EK()(entries_[i]) = 0;
230
- num_entries_ = 0;
235
+ num_entries_ = nominal_size;
231
236
  // relies on consolidating non-empty entries to the front
232
237
  for (size_t i = 0; i < nominal_size; ++i) {
233
- insert(find(EK()(old_entries[i])).first, std::move(old_entries[i])); // consider a special insert with no comparison
238
+ new (find(EK()(old_entries[i])).first) EN(std::move(old_entries[i]));
234
239
  old_entries[i].~EN();
235
240
  }
236
241
  for (size_t i = nominal_size; i < num_old_entries; ++i) old_entries[i].~EN();
@@ -301,7 +306,7 @@ Derived& theta_base_builder<Derived, Allocator>::set_seed(uint64_t seed) {
301
306
 
302
307
  template<typename Derived, typename Allocator>
303
308
  uint64_t theta_base_builder<Derived, Allocator>::starting_theta() const {
304
- if (p_ < 1) return theta_constants::MAX_THETA * p_;
309
+ if (p_ < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p_);
305
310
  return theta_constants::MAX_THETA;
306
311
  }
307
312
 
@@ -37,7 +37,7 @@ TEST_CASE("theta a-not-b: empty", "[theta_a_not_b]") {
37
37
  TEST_CASE("theta a-not-b: non empty no retained keys", "[theta_a_not_b]") {
38
38
  update_theta_sketch a = update_theta_sketch::builder().build();
39
39
  a.update(1);
40
- update_theta_sketch b = update_theta_sketch::builder().set_p(0.001).build();
40
+ update_theta_sketch b = update_theta_sketch::builder().set_p(0.001f).build();
41
41
  theta_a_not_b a_not_b;
42
42
 
43
43
  // B is still empty
@@ -167,6 +167,28 @@ TEST_CASE("theta a-not-b: estimation mode half overlap", "[theta_a_not_b]") {
167
167
  REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
168
168
  }
169
169
 
170
+ TEST_CASE("theta a-not-b: estimation mode half overlap wrapped compact", "[theta_a_not_b]") {
171
+ update_theta_sketch a = update_theta_sketch::builder().build();
172
+ int value = 0;
173
+ for (int i = 0; i < 10000; i++) a.update(value++);
174
+ auto bytes_a = a.compact().serialize();
175
+
176
+ update_theta_sketch b = update_theta_sketch::builder().build();
177
+ value = 5000;
178
+ for (int i = 0; i < 10000; i++) b.update(value++);
179
+ auto bytes_b = b.compact().serialize();
180
+
181
+ theta_a_not_b a_not_b;
182
+
183
+ auto result = a_not_b.compute(
184
+ wrapped_compact_theta_sketch::wrap(bytes_a.data(), bytes_a.size()),
185
+ wrapped_compact_theta_sketch::wrap(bytes_b.data(), bytes_b.size())
186
+ );
187
+ REQUIRE_FALSE(result.is_empty());
188
+ REQUIRE(result.is_estimation_mode());
189
+ REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
190
+ }
191
+
170
192
  TEST_CASE("theta a-not-b: estimation mode disjoint", "[theta_a_not_b]") {
171
193
  update_theta_sketch a = update_theta_sketch::builder().build();
172
194
  int value = 0;
@@ -48,7 +48,7 @@ TEST_CASE("theta intersection: empty", "[theta_intersection]") {
48
48
  }
49
49
 
50
50
  TEST_CASE("theta intersection: non empty no retained keys", "[theta_intersection]") {
51
- update_theta_sketch sketch = update_theta_sketch::builder().set_p(0.001).build();
51
+ update_theta_sketch sketch = update_theta_sketch::builder().set_p(0.001f).build();
52
52
  sketch.update(1);
53
53
  theta_intersection intersection;
54
54
  intersection.update(sketch);
@@ -174,6 +174,26 @@ TEST_CASE("theta intersection: estimation mode half overlap ordered", "[theta_in
174
174
  REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
175
175
  }
176
176
 
177
+ TEST_CASE("theta intersection: estimation mode half overlap ordered wrapped compact", "[theta_intersection]") {
178
+ update_theta_sketch sketch1 = update_theta_sketch::builder().build();
179
+ int value = 0;
180
+ for (int i = 0; i < 10000; i++) sketch1.update(value++);
181
+ auto bytes1 = sketch1.compact().serialize();
182
+
183
+ update_theta_sketch sketch2 = update_theta_sketch::builder().build();
184
+ value = 5000;
185
+ for (int i = 0; i < 10000; i++) sketch2.update(value++);
186
+ auto bytes2 = sketch2.compact().serialize();
187
+
188
+ theta_intersection intersection;
189
+ intersection.update(wrapped_compact_theta_sketch::wrap(bytes1.data(), bytes1.size()));
190
+ intersection.update(wrapped_compact_theta_sketch::wrap(bytes2.data(), bytes2.size()));
191
+ compact_theta_sketch result = intersection.get_result();
192
+ REQUIRE_FALSE(result.is_empty());
193
+ REQUIRE(result.is_estimation_mode());
194
+ REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
195
+ }
196
+
177
197
  TEST_CASE("theta intersection: estimation mode disjoint unordered", "[theta_intersection]") {
178
198
  update_theta_sketch sketch1 = update_theta_sketch::builder().build();
179
199
  int value = 0;
@@ -100,6 +100,28 @@ TEST_CASE("theta jaccard: half overlap estimation mode", "[theta_sketch]") {
100
100
  REQUIRE(jc[2] == Approx(0.33).margin(0.01));
101
101
  }
102
102
 
103
+ TEST_CASE("theta jaccard: half overlap estimation mode custom seed", "[theta_sketch]") {
104
+ const uint64_t seed = 123;
105
+ auto sk_a = update_theta_sketch::builder().set_seed(seed).build();
106
+ auto sk_b = update_theta_sketch::builder().set_seed(seed).build();
107
+ for (int i = 0; i < 10000; ++i) {
108
+ sk_a.update(i);
109
+ sk_b.update(i + 5000);
110
+ }
111
+
112
+ // update sketches
113
+ auto jc = theta_jaccard_similarity::jaccard(sk_a, sk_b, seed);
114
+ REQUIRE(jc[0] == Approx(0.33).margin(0.01));
115
+ REQUIRE(jc[1] == Approx(0.33).margin(0.01));
116
+ REQUIRE(jc[2] == Approx(0.33).margin(0.01));
117
+
118
+ // compact sketches
119
+ jc = theta_jaccard_similarity::jaccard(sk_a.compact(), sk_b.compact(), seed);
120
+ REQUIRE(jc[0] == Approx(0.33).margin(0.01));
121
+ REQUIRE(jc[1] == Approx(0.33).margin(0.01));
122
+ REQUIRE(jc[2] == Approx(0.33).margin(0.01));
123
+ }
124
+
103
125
  /**
104
126
  * The distribution is quite tight, about +/- 0.7%, which is pretty good since the accuracy of the
105
127
  * underlying sketch is about +/- 1.56%.
@@ -107,7 +129,7 @@ TEST_CASE("theta jaccard: half overlap estimation mode", "[theta_sketch]") {
107
129
  TEST_CASE("theta jaccard: similarity test", "[theta_sketch]") {
108
130
  const int8_t min_lg_k = 12;
109
131
  const int u1 = 1 << 20;
110
- const int u2 = u1 * 0.95;
132
+ const int u2 = static_cast<int>(u1 * 0.95);
111
133
  const double threshold = 0.943;
112
134
 
113
135
  auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).build();
@@ -120,6 +142,23 @@ TEST_CASE("theta jaccard: similarity test", "[theta_sketch]") {
120
142
  REQUIRE(theta_jaccard_similarity::similarity_test(actual, actual, threshold));
121
143
  }
122
144
 
145
+ TEST_CASE("theta jaccard: similarity test custom seed", "[theta_sketch]") {
146
+ const int8_t min_lg_k = 12;
147
+ const int u1 = 1 << 20;
148
+ const int u2 = static_cast<int>(u1 * 0.95);
149
+ const double threshold = 0.943;
150
+ const uint64_t seed = 1234;
151
+
152
+ auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
153
+ for (int i = 0; i < u1; ++i) expected.update(i);
154
+
155
+ auto actual = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
156
+ for (int i = 0; i < u2; ++i) actual.update(i);
157
+
158
+ REQUIRE(theta_jaccard_similarity::similarity_test(actual, expected, threshold, seed));
159
+ REQUIRE(theta_jaccard_similarity::similarity_test(actual, actual, threshold, seed));
160
+ }
161
+
123
162
  /**
124
163
  * The distribution is much looser here, about +/- 14%. This is due to the fact that intersections loose accuracy
125
164
  * as the ratio of intersection to the union becomes a small number.
@@ -127,7 +166,7 @@ TEST_CASE("theta jaccard: similarity test", "[theta_sketch]") {
127
166
  TEST_CASE("theta jaccard: dissimilarity test", "[theta_sketch]") {
128
167
  const int8_t min_lg_k = 12;
129
168
  const int u1 = 1 << 20;
130
- const int u2 = u1 * 0.05;
169
+ const int u2 = static_cast<int>(u1 * 0.05);
131
170
  const double threshold = 0.061;
132
171
 
133
172
  auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).build();
@@ -140,4 +179,21 @@ TEST_CASE("theta jaccard: dissimilarity test", "[theta_sketch]") {
140
179
  REQUIRE_FALSE(theta_jaccard_similarity::dissimilarity_test(actual, actual, threshold));
141
180
  }
142
181
 
182
+ TEST_CASE("theta jaccard: dissimilarity test custom seed", "[theta_sketch]") {
183
+ const int8_t min_lg_k = 12;
184
+ const int u1 = 1 << 20;
185
+ const int u2 = static_cast<int>(u1 * 0.05);
186
+ const double threshold = 0.061;
187
+ const uint64_t seed = 1234;
188
+
189
+ auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
190
+ for (int i = 0; i < u1; ++i) expected.update(i);
191
+
192
+ auto actual = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
193
+ for (int i = 0; i < u2; ++i) actual.update(i);
194
+
195
+ REQUIRE(theta_jaccard_similarity::dissimilarity_test(actual, expected, threshold, seed));
196
+ REQUIRE_FALSE(theta_jaccard_similarity::dissimilarity_test(actual, actual, threshold, seed));
197
+ }
198
+
143
199
  } /* namespace datasketches */
@@ -50,7 +50,7 @@ TEST_CASE("theta sketch: empty", "[theta_sketch]") {
50
50
  }
51
51
 
52
52
  TEST_CASE("theta sketch: non empty no retained keys", "[theta_sketch]") {
53
- update_theta_sketch update_sketch = update_theta_sketch::builder().set_p(0.001).build();
53
+ update_theta_sketch update_sketch = update_theta_sketch::builder().set_p(0.001f).build();
54
54
  update_sketch.update(1);
55
55
  //std::cerr << update_sketch.to_string();
56
56
  REQUIRE(update_sketch.get_num_retained() == 0);
@@ -238,4 +238,40 @@ TEST_CASE("theta sketch: deserialize compact single item buffer overrun", "[thet
238
238
  REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
239
239
  }
240
240
 
241
+ TEST_CASE("theta sketch: conversion constructor and wrapped compact", "[theta_sketch]") {
242
+ update_theta_sketch update_sketch = update_theta_sketch::builder().build();
243
+ const int n = 8192;
244
+ for (int i = 0; i < n; i++) update_sketch.update(i);
245
+
246
+ // unordered
247
+ auto unordered_compact1 = update_sketch.compact(false);
248
+ compact_theta_sketch unordered_compact2(update_sketch, false);
249
+ auto it = unordered_compact1.begin();
250
+ for (auto entry: unordered_compact2) {
251
+ REQUIRE(*it == entry);
252
+ ++it;
253
+ }
254
+
255
+ // ordered
256
+ auto ordered_compact1 = update_sketch.compact();
257
+ compact_theta_sketch ordered_compact2(update_sketch, true);
258
+ it = ordered_compact1.begin();
259
+ for (auto entry: ordered_compact2) {
260
+ REQUIRE(*it == entry);
261
+ ++it;
262
+ }
263
+
264
+ // wrapped compact
265
+ auto bytes = ordered_compact1.serialize();
266
+ auto ordered_compact3 = wrapped_compact_theta_sketch::wrap(bytes.data(), bytes.size());
267
+ it = ordered_compact1.begin();
268
+ for (auto entry: ordered_compact3) {
269
+ REQUIRE(*it == entry);
270
+ ++it;
271
+ }
272
+
273
+ // seed mismatch
274
+ REQUIRE_THROWS_AS(wrapped_compact_theta_sketch::wrap(bytes.data(), bytes.size(), 0), std::invalid_argument);
275
+ }
276
+
241
277
  } /* namespace datasketches */
@@ -39,7 +39,7 @@ TEST_CASE("theta union: empty", "[theta_union]") {
39
39
  }
40
40
 
41
41
  TEST_CASE("theta union: non empty no retained keys", "[theta_union]") {
42
- update_theta_sketch update_sketch = update_theta_sketch::builder().set_p(0.001).build();
42
+ update_theta_sketch update_sketch = update_theta_sketch::builder().set_p(0.001f).build();
43
43
  update_sketch.update(1);
44
44
  theta_union u = theta_union::builder().build();
45
45
  u.update(update_sketch);
@@ -65,7 +65,27 @@ TEST_CASE("theta union: exact mode half overlap", "[theta_union]") {
65
65
  compact_theta_sketch sketch3 = u.get_result();
66
66
  REQUIRE_FALSE(sketch3.is_empty());
67
67
  REQUIRE_FALSE(sketch3.is_estimation_mode());
68
- REQUIRE(sketch3.get_estimate() == Approx(1500).margin(1500 * 0.01));
68
+ REQUIRE(sketch3.get_estimate() == 1500.0);
69
+ }
70
+
71
+ TEST_CASE("theta union: exact mode half overlap wrapped compact", "[theta_union]") {
72
+ update_theta_sketch sketch1 = update_theta_sketch::builder().build();
73
+ int value = 0;
74
+ for (int i = 0; i < 1000; i++) sketch1.update(value++);
75
+ auto bytes1 = sketch1.compact().serialize();
76
+
77
+ update_theta_sketch sketch2 = update_theta_sketch::builder().build();
78
+ value = 500;
79
+ for (int i = 0; i < 1000; i++) sketch2.update(value++);
80
+ auto bytes2 = sketch2.compact().serialize();
81
+
82
+ theta_union u = theta_union::builder().build();
83
+ u.update(wrapped_compact_theta_sketch::wrap(bytes1.data(), bytes1.size()));
84
+ u.update(wrapped_compact_theta_sketch::wrap(bytes2.data(), bytes2.size()));
85
+ compact_theta_sketch sketch3 = u.get_result();
86
+ REQUIRE_FALSE(sketch3.is_empty());
87
+ REQUIRE_FALSE(sketch3.is_estimation_mode());
88
+ REQUIRE(sketch3.get_estimate() == 1500.0);
69
89
  }
70
90
 
71
91
  TEST_CASE("theta union: estimation mode half overlap", "[theta_union]") {
@@ -70,33 +70,33 @@ uint8_t compact_array_of_doubles_sketch_alloc<A>::get_num_values() const {
70
70
  template<typename A>
71
71
  void compact_array_of_doubles_sketch_alloc<A>::serialize(std::ostream& os) const {
72
72
  const uint8_t preamble_longs = 1;
73
- os.write(reinterpret_cast<const char*>(&preamble_longs), sizeof(preamble_longs));
73
+ write(os, preamble_longs);
74
74
  const uint8_t serial_version = SERIAL_VERSION;
75
- os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
75
+ write(os, serial_version);
76
76
  const uint8_t family = SKETCH_FAMILY;
77
- os.write(reinterpret_cast<const char*>(&family), sizeof(family));
77
+ write(os, family);
78
78
  const uint8_t type = SKETCH_TYPE;
79
- os.write(reinterpret_cast<const char*>(&type), sizeof(type));
79
+ write(os, type);
80
80
  const uint8_t flags_byte(
81
81
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
82
82
  (this->get_num_retained() > 0 ? 1 << flags::HAS_ENTRIES : 0) |
83
83
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
84
84
  );
85
- os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
86
- os.write(reinterpret_cast<const char*>(&num_values_), sizeof(num_values_));
85
+ write(os, flags_byte);
86
+ write(os, num_values_);
87
87
  const uint16_t seed_hash = this->get_seed_hash();
88
- os.write(reinterpret_cast<const char*>(&seed_hash), sizeof(seed_hash));
89
- os.write(reinterpret_cast<const char*>(&(this->theta_)), sizeof(uint64_t));
88
+ write(os, seed_hash);
89
+ write(os, this->theta_);
90
90
  if (this->get_num_retained() > 0) {
91
- const uint32_t num_entries = this->entries_.size();
92
- os.write(reinterpret_cast<const char*>(&num_entries), sizeof(num_entries));
91
+ const uint32_t num_entries = static_cast<uint32_t>(this->entries_.size());
92
+ write(os, num_entries);
93
93
  const uint32_t unused32 = 0;
94
- os.write(reinterpret_cast<const char*>(&unused32), sizeof(unused32));
94
+ write(os, unused32);
95
95
  for (const auto& it: this->entries_) {
96
- os.write(reinterpret_cast<const char*>(&it.first), sizeof(uint64_t));
96
+ write(os, it.first);
97
97
  }
98
98
  for (const auto& it: this->entries_) {
99
- os.write(reinterpret_cast<const char*>(it.second.data()), it.second.size() * sizeof(double));
99
+ write(os, it.second.data(), it.second.size() * sizeof(double));
100
100
  }
101
101
  }
102
102
  }
@@ -110,30 +110,29 @@ auto compact_array_of_doubles_sketch_alloc<A>::serialize(unsigned header_size_by
110
110
  vector_bytes bytes(size, 0, this->entries_.get_allocator());
111
111
  uint8_t* ptr = bytes.data() + header_size_bytes;
112
112
 
113
- ptr += copy_to_mem(&preamble_longs, ptr, sizeof(preamble_longs));
113
+ ptr += copy_to_mem(preamble_longs, ptr);
114
114
  const uint8_t serial_version = SERIAL_VERSION;
115
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
115
+ ptr += copy_to_mem(serial_version, ptr);
116
116
  const uint8_t family = SKETCH_FAMILY;
117
- ptr += copy_to_mem(&family, ptr, sizeof(family));
117
+ ptr += copy_to_mem(family, ptr);
118
118
  const uint8_t type = SKETCH_TYPE;
119
- ptr += copy_to_mem(&type, ptr, sizeof(type));
119
+ ptr += copy_to_mem(type, ptr);
120
120
  const uint8_t flags_byte(
121
121
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
122
122
  (this->get_num_retained() ? 1 << flags::HAS_ENTRIES : 0) |
123
123
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
124
124
  );
125
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
126
- ptr += copy_to_mem(&num_values_, ptr, sizeof(num_values_));
125
+ ptr += copy_to_mem(flags_byte, ptr);
126
+ ptr += copy_to_mem(num_values_, ptr);
127
127
  const uint16_t seed_hash = this->get_seed_hash();
128
- ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
129
- ptr += copy_to_mem(&(this->theta_), ptr, sizeof(uint64_t));
128
+ ptr += copy_to_mem(seed_hash, ptr);
129
+ ptr += copy_to_mem((this->theta_), ptr);
130
130
  if (this->get_num_retained() > 0) {
131
- const uint32_t num_entries = this->entries_.size();
132
- ptr += copy_to_mem(&num_entries, ptr, sizeof(num_entries));
133
- const uint32_t unused32 = 0;
134
- ptr += copy_to_mem(&unused32, ptr, sizeof(unused32));
131
+ const uint32_t num_entries = static_cast<uint32_t>(this->entries_.size());
132
+ ptr += copy_to_mem(num_entries, ptr);
133
+ ptr += sizeof(uint32_t); // unused
135
134
  for (const auto& it: this->entries_) {
136
- ptr += copy_to_mem(&it.first, ptr, sizeof(uint64_t));
135
+ ptr += copy_to_mem(it.first, ptr);
137
136
  }
138
137
  for (const auto& it: this->entries_) {
139
138
  ptr += copy_to_mem(it.second.data(), ptr, it.second.size() * sizeof(double));
@@ -144,40 +143,30 @@ auto compact_array_of_doubles_sketch_alloc<A>::serialize(unsigned header_size_by
144
143
 
145
144
  template<typename A>
146
145
  compact_array_of_doubles_sketch_alloc<A> compact_array_of_doubles_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
147
- uint8_t preamble_longs;
148
- is.read(reinterpret_cast<char*>(&preamble_longs), sizeof(preamble_longs));
149
- uint8_t serial_version;
150
- is.read(reinterpret_cast<char*>(&serial_version), sizeof(serial_version));
151
- uint8_t family;
152
- is.read(reinterpret_cast<char*>(&family), sizeof(family));
153
- uint8_t type;
154
- is.read(reinterpret_cast<char*>(&type), sizeof(type));
155
- uint8_t flags_byte;
156
- is.read(reinterpret_cast<char*>(&flags_byte), sizeof(flags_byte));
157
- uint8_t num_values;
158
- is.read(reinterpret_cast<char*>(&num_values), sizeof(num_values));
159
- uint16_t seed_hash;
160
- is.read(reinterpret_cast<char*>(&seed_hash), sizeof(seed_hash));
146
+ read<uint8_t>(is); // unused
147
+ const auto serial_version = read<uint8_t>(is);
148
+ const auto family = read<uint8_t>(is);
149
+ const auto type = read<uint8_t>(is);
150
+ const auto flags_byte = read<uint8_t>(is);
151
+ const auto num_values = read<uint8_t>(is);
152
+ const auto seed_hash = read<uint16_t>(is);
161
153
  checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
162
154
  checker<true>::check_sketch_family(family, SKETCH_FAMILY);
163
155
  checker<true>::check_sketch_type(type, SKETCH_TYPE);
164
156
  const bool has_entries = flags_byte & (1 << flags::HAS_ENTRIES);
165
157
  if (has_entries) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
166
158
 
167
- uint64_t theta;
168
- is.read(reinterpret_cast<char*>(&theta), sizeof(theta));
159
+ const auto theta = read<uint64_t>(is);
169
160
  std::vector<Entry, AllocEntry> entries(allocator);
170
161
  if (has_entries) {
171
- uint32_t num_entries;
172
- is.read(reinterpret_cast<char*>(&num_entries), sizeof(num_entries));
173
- uint32_t unused32;
174
- is.read(reinterpret_cast<char*>(&unused32), sizeof(unused32));
162
+ const auto num_entries = read<uint32_t>(is);
163
+ read<uint32_t>(is); // unused
175
164
  entries.reserve(num_entries);
176
165
  std::vector<uint64_t, AllocU64> keys(num_entries, 0, allocator);
177
- is.read(reinterpret_cast<char*>(keys.data()), num_entries * sizeof(uint64_t));
166
+ read(is, keys.data(), num_entries * sizeof(uint64_t));
178
167
  for (size_t i = 0; i < num_entries; ++i) {
179
168
  aod<A> summary(num_values, allocator);
180
- is.read(reinterpret_cast<char*>(summary.data()), num_values * sizeof(double));
169
+ read(is, summary.data(), num_values * sizeof(double));
181
170
  entries.push_back(Entry(keys[i], std::move(summary)));
182
171
  }
183
172
  }
@@ -191,20 +180,19 @@ template<typename A>
191
180
  compact_array_of_doubles_sketch_alloc<A> compact_array_of_doubles_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
192
181
  ensure_minimum_memory(size, 16);
193
182
  const char* ptr = static_cast<const char*>(bytes);
194
- uint8_t preamble_longs;
195
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
183
+ ptr += sizeof(uint8_t); // unused
196
184
  uint8_t serial_version;
197
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
185
+ ptr += copy_from_mem(ptr, serial_version);
198
186
  uint8_t family;
199
- ptr += copy_from_mem(ptr, &family, sizeof(family));
187
+ ptr += copy_from_mem(ptr, family);
200
188
  uint8_t type;
201
- ptr += copy_from_mem(ptr, &type, sizeof(type));
189
+ ptr += copy_from_mem(ptr, type);
202
190
  uint8_t flags_byte;
203
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
191
+ ptr += copy_from_mem(ptr, flags_byte);
204
192
  uint8_t num_values;
205
- ptr += copy_from_mem(ptr, &num_values, sizeof(num_values));
193
+ ptr += copy_from_mem(ptr, num_values);
206
194
  uint16_t seed_hash;
207
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
195
+ ptr += copy_from_mem(ptr, seed_hash);
208
196
  checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
209
197
  checker<true>::check_sketch_family(family, SKETCH_FAMILY);
210
198
  checker<true>::check_sketch_type(type, SKETCH_TYPE);
@@ -212,14 +200,13 @@ compact_array_of_doubles_sketch_alloc<A> compact_array_of_doubles_sketch_alloc<A
212
200
  if (has_entries) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
213
201
 
214
202
  uint64_t theta;
215
- ptr += copy_from_mem(ptr, &theta, sizeof(theta));
203
+ ptr += copy_from_mem(ptr, theta);
216
204
  std::vector<Entry, AllocEntry> entries(allocator);
217
205
  if (has_entries) {
218
206
  ensure_minimum_memory(size, 24);
219
207
  uint32_t num_entries;
220
- ptr += copy_from_mem(ptr, &num_entries, sizeof(num_entries));
221
- uint32_t unused32;
222
- ptr += copy_from_mem(ptr, &unused32, sizeof(unused32));
208
+ ptr += copy_from_mem(ptr, num_entries);
209
+ ptr += sizeof(uint32_t); // unused
223
210
  ensure_minimum_memory(size, 24 + (sizeof(uint64_t) + sizeof(double) * num_values) * num_entries);
224
211
  entries.reserve(num_entries);
225
212
  std::vector<uint64_t, AllocU64> keys(num_entries, 0, allocator);