datasketches 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (117) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/datasketches/version.rb +1 -1
  4. data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
  5. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  6. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  7. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
  9. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  10. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  11. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  12. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  13. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  14. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
  15. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
  16. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
  17. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  18. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  19. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  20. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  21. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  22. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  23. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  24. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
  25. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  26. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  28. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  29. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  30. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  31. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  32. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  33. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  34. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  35. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  36. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  37. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  38. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  39. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  40. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  41. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  42. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  43. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  44. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  45. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
  46. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  47. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  48. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  49. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  50. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  51. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  52. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  53. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  54. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  55. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  56. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  57. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  58. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  59. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  60. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  61. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  62. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  63. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  64. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  65. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  66. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  67. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
  69. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  70. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  71. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
  73. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  74. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  75. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
  76. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  78. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  79. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  80. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
  84. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  85. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
  86. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
  87. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  88. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  89. data/vendor/datasketches-cpp/setup.py +1 -1
  90. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  91. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  92. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  93. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  94. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  95. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  96. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
  97. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
  98. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
  99. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  100. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
  101. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
  103. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  104. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  105. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  106. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
  107. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  108. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  109. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
  110. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  111. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  112. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  113. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  114. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  115. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  116. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
  117. metadata +8 -3
@@ -35,13 +35,13 @@ public:
35
35
  using CompactSketch = compact_theta_sketch_alloc<Allocator>;
36
36
  using resize_factor = theta_constants::resize_factor;
37
37
 
38
- struct pass_through_policy {
39
- uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
38
+ struct nop_policy {
39
+ void operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
40
+ unused(internal_entry);
40
41
  unused(incoming_entry);
41
- return internal_entry;
42
42
  }
43
43
  };
44
- using State = theta_union_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
44
+ using State = theta_union_base<Entry, ExtractKey, nop_policy, Sketch, CompactSketch, Allocator>;
45
45
 
46
46
  // No constructor here. Use builder instead.
47
47
  class builder;
@@ -43,7 +43,7 @@ void theta_union_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
43
43
  if (sketch.get_theta64() < union_theta_) union_theta_ = sketch.get_theta64();
44
44
  for (auto& entry: sketch) {
45
45
  const uint64_t hash = EK()(entry);
46
- if (hash < union_theta_) {
46
+ if (hash < union_theta_ && hash < table_.theta_) {
47
47
  auto result = table_.find(hash);
48
48
  if (!result.second) {
49
49
  table_.insert(result.first, conditional_forward<SS>(entry));
@@ -24,7 +24,7 @@ namespace datasketches {
24
24
 
25
25
  template<typename A>
26
26
  theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator):
27
- state_(lg_cur_size, lg_nom_size, rf, theta, seed, pass_through_policy(), allocator)
27
+ state_(lg_cur_size, lg_nom_size, rf, theta, seed, nop_policy(), allocator)
28
28
  {}
29
29
 
30
30
  template<typename A>
@@ -53,6 +53,8 @@ struct theta_update_sketch_base {
53
53
  inline uint64_t hash_and_screen(const void* data, size_t length);
54
54
 
55
55
  inline std::pair<iterator, bool> find(uint64_t key) const;
56
+ static inline std::pair<iterator, bool> find(Entry* entries, uint8_t lg_size, uint64_t key);
57
+
56
58
 
57
59
  template<typename FwdEntry>
58
60
  inline void insert(iterator it, FwdEntry&& entry);
@@ -39,7 +39,7 @@ seed_(seed),
39
39
  entries_(nullptr)
40
40
  {
41
41
  if (lg_cur_size > 0) {
42
- const size_t size = 1 << lg_cur_size;
42
+ const size_t size = 1ULL << lg_cur_size;
43
43
  entries_ = allocator_.allocate(size);
44
44
  for (size_t i = 0; i < size; ++i) EK()(entries_[i]) = 0;
45
45
  }
@@ -58,7 +58,7 @@ seed_(other.seed_),
58
58
  entries_(nullptr)
59
59
  {
60
60
  if (other.entries_ != nullptr) {
61
- const size_t size = 1 << lg_cur_size_;
61
+ const size_t size = 1ULL << lg_cur_size_;
62
62
  entries_ = allocator_.allocate(size);
63
63
  for (size_t i = 0; i < size; ++i) {
64
64
  if (EK()(other.entries_[i]) != 0) {
@@ -89,7 +89,7 @@ template<typename EN, typename EK, typename A>
89
89
  theta_update_sketch_base<EN, EK, A>::~theta_update_sketch_base()
90
90
  {
91
91
  if (entries_ != nullptr) {
92
- const size_t size = 1 << lg_cur_size_;
92
+ const size_t size = 1ULL << lg_cur_size_;
93
93
  for (size_t i = 0; i < size; ++i) {
94
94
  if (EK()(entries_[i]) != 0) entries_[i].~EN();
95
95
  }
@@ -136,18 +136,23 @@ uint64_t theta_update_sketch_base<EN, EK, A>::hash_and_screen(const void* data,
136
136
 
137
137
  template<typename EN, typename EK, typename A>
138
138
  auto theta_update_sketch_base<EN, EK, A>::find(uint64_t key) const -> std::pair<iterator, bool> {
139
- const size_t size = 1 << lg_cur_size_;
140
- const size_t mask = size - 1;
141
- const uint32_t stride = get_stride(key, lg_cur_size_);
139
+ return find(entries_, lg_cur_size_, key);
140
+ }
141
+
142
+ template<typename EN, typename EK, typename A>
143
+ auto theta_update_sketch_base<EN, EK, A>::find(EN* entries, uint8_t lg_size, uint64_t key) -> std::pair<iterator, bool> {
144
+ const uint32_t size = 1 << lg_size;
145
+ const uint32_t mask = size - 1;
146
+ const uint32_t stride = get_stride(key, lg_size);
142
147
  uint32_t index = static_cast<uint32_t>(key) & mask;
143
148
  // search for duplicate or zero
144
149
  const uint32_t loop_index = index;
145
150
  do {
146
- const uint64_t probe = EK()(entries_[index]);
151
+ const uint64_t probe = EK()(entries[index]);
147
152
  if (probe == 0) {
148
- return std::pair<iterator, bool>(&entries_[index], false);
153
+ return std::pair<iterator, bool>(&entries[index], false);
149
154
  } else if (probe == key) {
150
- return std::pair<iterator, bool>(&entries_[index], true);
155
+ return std::pair<iterator, bool>(&entries[index], true);
151
156
  }
152
157
  index = (index + stride) & mask;
153
158
  } while (index != loop_index);
@@ -175,13 +180,13 @@ auto theta_update_sketch_base<EN, EK, A>::begin() const -> iterator {
175
180
 
176
181
  template<typename EN, typename EK, typename A>
177
182
  auto theta_update_sketch_base<EN, EK, A>::end() const -> iterator {
178
- return &entries_[1 << lg_cur_size_];
183
+ return &entries_[1ULL << lg_cur_size_];
179
184
  }
180
185
 
181
186
  template<typename EN, typename EK, typename A>
182
187
  uint32_t theta_update_sketch_base<EN, EK, A>::get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size) {
183
188
  const double fraction = (lg_cur_size <= lg_nom_size) ? RESIZE_THRESHOLD : REBUILD_THRESHOLD;
184
- return std::floor(fraction * (1 << lg_cur_size));
189
+ return static_cast<uint32_t>(std::floor(fraction * (1 << lg_cur_size)));
185
190
  }
186
191
 
187
192
  template<typename EN, typename EK, typename A>
@@ -192,29 +197,29 @@ uint32_t theta_update_sketch_base<EN, EK, A>::get_stride(uint64_t key, uint8_t l
192
197
 
193
198
  template<typename EN, typename EK, typename A>
194
199
  void theta_update_sketch_base<EN, EK, A>::resize() {
195
- const size_t old_size = 1 << lg_cur_size_;
196
- const uint8_t lg_tgt_size = lg_nom_size_ + 1;
197
- const uint8_t factor = std::max(1, std::min(static_cast<int>(rf_), lg_tgt_size - lg_cur_size_));
198
- lg_cur_size_ += factor;
199
- const size_t new_size = 1 << lg_cur_size_;
200
- EN* old_entries = entries_;
201
- entries_ = allocator_.allocate(new_size);
202
- for (size_t i = 0; i < new_size; ++i) EK()(entries_[i]) = 0;
203
- num_entries_ = 0;
200
+ const size_t old_size = 1ULL << lg_cur_size_;
201
+ const uint8_t lg_new_size = std::min<uint8_t>(lg_cur_size_ + static_cast<uint8_t>(rf_), lg_nom_size_ + 1);
202
+ const size_t new_size = 1ULL << lg_new_size;
203
+ EN* new_entries = allocator_.allocate(new_size);
204
+ for (size_t i = 0; i < new_size; ++i) EK()(new_entries[i]) = 0;
204
205
  for (size_t i = 0; i < old_size; ++i) {
205
- const uint64_t key = EK()(old_entries[i]);
206
+ const uint64_t key = EK()(entries_[i]);
206
207
  if (key != 0) {
207
- insert(find(key).first, std::move(old_entries[i])); // consider a special insert with no comparison
208
- old_entries[i].~EN();
208
+ // always finds an empty slot in a larger table
209
+ new (find(new_entries, lg_new_size, key).first) EN(std::move(entries_[i]));
210
+ entries_[i].~EN();
211
+ EK()(entries_[i]) = 0;
209
212
  }
210
213
  }
211
- allocator_.deallocate(old_entries, old_size);
214
+ std::swap(entries_, new_entries);
215
+ lg_cur_size_ = lg_new_size;
216
+ allocator_.deallocate(new_entries, old_size);
212
217
  }
213
218
 
214
219
  // assumes number of entries > nominal size
215
220
  template<typename EN, typename EK, typename A>
216
221
  void theta_update_sketch_base<EN, EK, A>::rebuild() {
217
- const size_t size = 1 << lg_cur_size_;
222
+ const size_t size = 1ULL << lg_cur_size_;
218
223
  const uint32_t nominal_size = 1 << lg_nom_size_;
219
224
 
220
225
  // empty entries have uninitialized payloads
@@ -227,10 +232,10 @@ void theta_update_sketch_base<EN, EK, A>::rebuild() {
227
232
  const size_t num_old_entries = num_entries_;
228
233
  entries_ = allocator_.allocate(size);
229
234
  for (size_t i = 0; i < size; ++i) EK()(entries_[i]) = 0;
230
- num_entries_ = 0;
235
+ num_entries_ = nominal_size;
231
236
  // relies on consolidating non-empty entries to the front
232
237
  for (size_t i = 0; i < nominal_size; ++i) {
233
- insert(find(EK()(old_entries[i])).first, std::move(old_entries[i])); // consider a special insert with no comparison
238
+ new (find(EK()(old_entries[i])).first) EN(std::move(old_entries[i]));
234
239
  old_entries[i].~EN();
235
240
  }
236
241
  for (size_t i = nominal_size; i < num_old_entries; ++i) old_entries[i].~EN();
@@ -301,7 +306,7 @@ Derived& theta_base_builder<Derived, Allocator>::set_seed(uint64_t seed) {
301
306
 
302
307
  template<typename Derived, typename Allocator>
303
308
  uint64_t theta_base_builder<Derived, Allocator>::starting_theta() const {
304
- if (p_ < 1) return theta_constants::MAX_THETA * p_;
309
+ if (p_ < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p_);
305
310
  return theta_constants::MAX_THETA;
306
311
  }
307
312
 
@@ -37,7 +37,7 @@ TEST_CASE("theta a-not-b: empty", "[theta_a_not_b]") {
37
37
  TEST_CASE("theta a-not-b: non empty no retained keys", "[theta_a_not_b]") {
38
38
  update_theta_sketch a = update_theta_sketch::builder().build();
39
39
  a.update(1);
40
- update_theta_sketch b = update_theta_sketch::builder().set_p(0.001).build();
40
+ update_theta_sketch b = update_theta_sketch::builder().set_p(0.001f).build();
41
41
  theta_a_not_b a_not_b;
42
42
 
43
43
  // B is still empty
@@ -167,6 +167,28 @@ TEST_CASE("theta a-not-b: estimation mode half overlap", "[theta_a_not_b]") {
167
167
  REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
168
168
  }
169
169
 
170
+ TEST_CASE("theta a-not-b: estimation mode half overlap wrapped compact", "[theta_a_not_b]") {
171
+ update_theta_sketch a = update_theta_sketch::builder().build();
172
+ int value = 0;
173
+ for (int i = 0; i < 10000; i++) a.update(value++);
174
+ auto bytes_a = a.compact().serialize();
175
+
176
+ update_theta_sketch b = update_theta_sketch::builder().build();
177
+ value = 5000;
178
+ for (int i = 0; i < 10000; i++) b.update(value++);
179
+ auto bytes_b = b.compact().serialize();
180
+
181
+ theta_a_not_b a_not_b;
182
+
183
+ auto result = a_not_b.compute(
184
+ wrapped_compact_theta_sketch::wrap(bytes_a.data(), bytes_a.size()),
185
+ wrapped_compact_theta_sketch::wrap(bytes_b.data(), bytes_b.size())
186
+ );
187
+ REQUIRE_FALSE(result.is_empty());
188
+ REQUIRE(result.is_estimation_mode());
189
+ REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
190
+ }
191
+
170
192
  TEST_CASE("theta a-not-b: estimation mode disjoint", "[theta_a_not_b]") {
171
193
  update_theta_sketch a = update_theta_sketch::builder().build();
172
194
  int value = 0;
@@ -48,7 +48,7 @@ TEST_CASE("theta intersection: empty", "[theta_intersection]") {
48
48
  }
49
49
 
50
50
  TEST_CASE("theta intersection: non empty no retained keys", "[theta_intersection]") {
51
- update_theta_sketch sketch = update_theta_sketch::builder().set_p(0.001).build();
51
+ update_theta_sketch sketch = update_theta_sketch::builder().set_p(0.001f).build();
52
52
  sketch.update(1);
53
53
  theta_intersection intersection;
54
54
  intersection.update(sketch);
@@ -174,6 +174,26 @@ TEST_CASE("theta intersection: estimation mode half overlap ordered", "[theta_in
174
174
  REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
175
175
  }
176
176
 
177
+ TEST_CASE("theta intersection: estimation mode half overlap ordered wrapped compact", "[theta_intersection]") {
178
+ update_theta_sketch sketch1 = update_theta_sketch::builder().build();
179
+ int value = 0;
180
+ for (int i = 0; i < 10000; i++) sketch1.update(value++);
181
+ auto bytes1 = sketch1.compact().serialize();
182
+
183
+ update_theta_sketch sketch2 = update_theta_sketch::builder().build();
184
+ value = 5000;
185
+ for (int i = 0; i < 10000; i++) sketch2.update(value++);
186
+ auto bytes2 = sketch2.compact().serialize();
187
+
188
+ theta_intersection intersection;
189
+ intersection.update(wrapped_compact_theta_sketch::wrap(bytes1.data(), bytes1.size()));
190
+ intersection.update(wrapped_compact_theta_sketch::wrap(bytes2.data(), bytes2.size()));
191
+ compact_theta_sketch result = intersection.get_result();
192
+ REQUIRE_FALSE(result.is_empty());
193
+ REQUIRE(result.is_estimation_mode());
194
+ REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
195
+ }
196
+
177
197
  TEST_CASE("theta intersection: estimation mode disjoint unordered", "[theta_intersection]") {
178
198
  update_theta_sketch sketch1 = update_theta_sketch::builder().build();
179
199
  int value = 0;
@@ -100,6 +100,28 @@ TEST_CASE("theta jaccard: half overlap estimation mode", "[theta_sketch]") {
100
100
  REQUIRE(jc[2] == Approx(0.33).margin(0.01));
101
101
  }
102
102
 
103
+ TEST_CASE("theta jaccard: half overlap estimation mode custom seed", "[theta_sketch]") {
104
+ const uint64_t seed = 123;
105
+ auto sk_a = update_theta_sketch::builder().set_seed(seed).build();
106
+ auto sk_b = update_theta_sketch::builder().set_seed(seed).build();
107
+ for (int i = 0; i < 10000; ++i) {
108
+ sk_a.update(i);
109
+ sk_b.update(i + 5000);
110
+ }
111
+
112
+ // update sketches
113
+ auto jc = theta_jaccard_similarity::jaccard(sk_a, sk_b, seed);
114
+ REQUIRE(jc[0] == Approx(0.33).margin(0.01));
115
+ REQUIRE(jc[1] == Approx(0.33).margin(0.01));
116
+ REQUIRE(jc[2] == Approx(0.33).margin(0.01));
117
+
118
+ // compact sketches
119
+ jc = theta_jaccard_similarity::jaccard(sk_a.compact(), sk_b.compact(), seed);
120
+ REQUIRE(jc[0] == Approx(0.33).margin(0.01));
121
+ REQUIRE(jc[1] == Approx(0.33).margin(0.01));
122
+ REQUIRE(jc[2] == Approx(0.33).margin(0.01));
123
+ }
124
+
103
125
  /**
104
126
  * The distribution is quite tight, about +/- 0.7%, which is pretty good since the accuracy of the
105
127
  * underlying sketch is about +/- 1.56%.
@@ -107,7 +129,7 @@ TEST_CASE("theta jaccard: half overlap estimation mode", "[theta_sketch]") {
107
129
  TEST_CASE("theta jaccard: similarity test", "[theta_sketch]") {
108
130
  const int8_t min_lg_k = 12;
109
131
  const int u1 = 1 << 20;
110
- const int u2 = u1 * 0.95;
132
+ const int u2 = static_cast<int>(u1 * 0.95);
111
133
  const double threshold = 0.943;
112
134
 
113
135
  auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).build();
@@ -120,6 +142,23 @@ TEST_CASE("theta jaccard: similarity test", "[theta_sketch]") {
120
142
  REQUIRE(theta_jaccard_similarity::similarity_test(actual, actual, threshold));
121
143
  }
122
144
 
145
+ TEST_CASE("theta jaccard: similarity test custom seed", "[theta_sketch]") {
146
+ const int8_t min_lg_k = 12;
147
+ const int u1 = 1 << 20;
148
+ const int u2 = static_cast<int>(u1 * 0.95);
149
+ const double threshold = 0.943;
150
+ const uint64_t seed = 1234;
151
+
152
+ auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
153
+ for (int i = 0; i < u1; ++i) expected.update(i);
154
+
155
+ auto actual = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
156
+ for (int i = 0; i < u2; ++i) actual.update(i);
157
+
158
+ REQUIRE(theta_jaccard_similarity::similarity_test(actual, expected, threshold, seed));
159
+ REQUIRE(theta_jaccard_similarity::similarity_test(actual, actual, threshold, seed));
160
+ }
161
+
123
162
  /**
124
163
  * The distribution is much looser here, about +/- 14%. This is due to the fact that intersections lose accuracy
125
164
  * as the ratio of intersection to the union becomes a small number.
@@ -127,7 +166,7 @@ TEST_CASE("theta jaccard: similarity test", "[theta_sketch]") {
127
166
  TEST_CASE("theta jaccard: dissimilarity test", "[theta_sketch]") {
128
167
  const int8_t min_lg_k = 12;
129
168
  const int u1 = 1 << 20;
130
- const int u2 = u1 * 0.05;
169
+ const int u2 = static_cast<int>(u1 * 0.05);
131
170
  const double threshold = 0.061;
132
171
 
133
172
  auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).build();
@@ -140,4 +179,21 @@ TEST_CASE("theta jaccard: dissimilarity test", "[theta_sketch]") {
140
179
  REQUIRE_FALSE(theta_jaccard_similarity::dissimilarity_test(actual, actual, threshold));
141
180
  }
142
181
 
182
+ TEST_CASE("theta jaccard: dissimilarity test custom seed", "[theta_sketch]") {
183
+ const int8_t min_lg_k = 12;
184
+ const int u1 = 1 << 20;
185
+ const int u2 = static_cast<int>(u1 * 0.05);
186
+ const double threshold = 0.061;
187
+ const uint64_t seed = 1234;
188
+
189
+ auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
190
+ for (int i = 0; i < u1; ++i) expected.update(i);
191
+
192
+ auto actual = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
193
+ for (int i = 0; i < u2; ++i) actual.update(i);
194
+
195
+ REQUIRE(theta_jaccard_similarity::dissimilarity_test(actual, expected, threshold, seed));
196
+ REQUIRE_FALSE(theta_jaccard_similarity::dissimilarity_test(actual, actual, threshold, seed));
197
+ }
198
+
143
199
  } /* namespace datasketches */
@@ -50,7 +50,7 @@ TEST_CASE("theta sketch: empty", "[theta_sketch]") {
50
50
  }
51
51
 
52
52
  TEST_CASE("theta sketch: non empty no retained keys", "[theta_sketch]") {
53
- update_theta_sketch update_sketch = update_theta_sketch::builder().set_p(0.001).build();
53
+ update_theta_sketch update_sketch = update_theta_sketch::builder().set_p(0.001f).build();
54
54
  update_sketch.update(1);
55
55
  //std::cerr << update_sketch.to_string();
56
56
  REQUIRE(update_sketch.get_num_retained() == 0);
@@ -238,4 +238,40 @@ TEST_CASE("theta sketch: deserialize compact single item buffer overrun", "[thet
238
238
  REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
239
239
  }
240
240
 
241
+ TEST_CASE("theta sketch: conversion constructor and wrapped compact", "[theta_sketch]") {
242
+ update_theta_sketch update_sketch = update_theta_sketch::builder().build();
243
+ const int n = 8192;
244
+ for (int i = 0; i < n; i++) update_sketch.update(i);
245
+
246
+ // unordered
247
+ auto unordered_compact1 = update_sketch.compact(false);
248
+ compact_theta_sketch unordered_compact2(update_sketch, false);
249
+ auto it = unordered_compact1.begin();
250
+ for (auto entry: unordered_compact2) {
251
+ REQUIRE(*it == entry);
252
+ ++it;
253
+ }
254
+
255
+ // ordered
256
+ auto ordered_compact1 = update_sketch.compact();
257
+ compact_theta_sketch ordered_compact2(update_sketch, true);
258
+ it = ordered_compact1.begin();
259
+ for (auto entry: ordered_compact2) {
260
+ REQUIRE(*it == entry);
261
+ ++it;
262
+ }
263
+
264
+ // wrapped compact
265
+ auto bytes = ordered_compact1.serialize();
266
+ auto ordered_compact3 = wrapped_compact_theta_sketch::wrap(bytes.data(), bytes.size());
267
+ it = ordered_compact1.begin();
268
+ for (auto entry: ordered_compact3) {
269
+ REQUIRE(*it == entry);
270
+ ++it;
271
+ }
272
+
273
+ // seed mismatch
274
+ REQUIRE_THROWS_AS(wrapped_compact_theta_sketch::wrap(bytes.data(), bytes.size(), 0), std::invalid_argument);
275
+ }
276
+
241
277
  } /* namespace datasketches */
@@ -39,7 +39,7 @@ TEST_CASE("theta union: empty", "[theta_union]") {
39
39
  }
40
40
 
41
41
  TEST_CASE("theta union: non empty no retained keys", "[theta_union]") {
42
- update_theta_sketch update_sketch = update_theta_sketch::builder().set_p(0.001).build();
42
+ update_theta_sketch update_sketch = update_theta_sketch::builder().set_p(0.001f).build();
43
43
  update_sketch.update(1);
44
44
  theta_union u = theta_union::builder().build();
45
45
  u.update(update_sketch);
@@ -65,7 +65,27 @@ TEST_CASE("theta union: exact mode half overlap", "[theta_union]") {
65
65
  compact_theta_sketch sketch3 = u.get_result();
66
66
  REQUIRE_FALSE(sketch3.is_empty());
67
67
  REQUIRE_FALSE(sketch3.is_estimation_mode());
68
- REQUIRE(sketch3.get_estimate() == Approx(1500).margin(1500 * 0.01));
68
+ REQUIRE(sketch3.get_estimate() == 1500.0);
69
+ }
70
+
71
+ TEST_CASE("theta union: exact mode half overlap wrapped compact", "[theta_union]") {
72
+ update_theta_sketch sketch1 = update_theta_sketch::builder().build();
73
+ int value = 0;
74
+ for (int i = 0; i < 1000; i++) sketch1.update(value++);
75
+ auto bytes1 = sketch1.compact().serialize();
76
+
77
+ update_theta_sketch sketch2 = update_theta_sketch::builder().build();
78
+ value = 500;
79
+ for (int i = 0; i < 1000; i++) sketch2.update(value++);
80
+ auto bytes2 = sketch2.compact().serialize();
81
+
82
+ theta_union u = theta_union::builder().build();
83
+ u.update(wrapped_compact_theta_sketch::wrap(bytes1.data(), bytes1.size()));
84
+ u.update(wrapped_compact_theta_sketch::wrap(bytes2.data(), bytes2.size()));
85
+ compact_theta_sketch sketch3 = u.get_result();
86
+ REQUIRE_FALSE(sketch3.is_empty());
87
+ REQUIRE_FALSE(sketch3.is_estimation_mode());
88
+ REQUIRE(sketch3.get_estimate() == 1500.0);
69
89
  }
70
90
 
71
91
  TEST_CASE("theta union: estimation mode half overlap", "[theta_union]") {
@@ -70,33 +70,33 @@ uint8_t compact_array_of_doubles_sketch_alloc<A>::get_num_values() const {
70
70
  template<typename A>
71
71
  void compact_array_of_doubles_sketch_alloc<A>::serialize(std::ostream& os) const {
72
72
  const uint8_t preamble_longs = 1;
73
- os.write(reinterpret_cast<const char*>(&preamble_longs), sizeof(preamble_longs));
73
+ write(os, preamble_longs);
74
74
  const uint8_t serial_version = SERIAL_VERSION;
75
- os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
75
+ write(os, serial_version);
76
76
  const uint8_t family = SKETCH_FAMILY;
77
- os.write(reinterpret_cast<const char*>(&family), sizeof(family));
77
+ write(os, family);
78
78
  const uint8_t type = SKETCH_TYPE;
79
- os.write(reinterpret_cast<const char*>(&type), sizeof(type));
79
+ write(os, type);
80
80
  const uint8_t flags_byte(
81
81
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
82
82
  (this->get_num_retained() > 0 ? 1 << flags::HAS_ENTRIES : 0) |
83
83
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
84
84
  );
85
- os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
86
- os.write(reinterpret_cast<const char*>(&num_values_), sizeof(num_values_));
85
+ write(os, flags_byte);
86
+ write(os, num_values_);
87
87
  const uint16_t seed_hash = this->get_seed_hash();
88
- os.write(reinterpret_cast<const char*>(&seed_hash), sizeof(seed_hash));
89
- os.write(reinterpret_cast<const char*>(&(this->theta_)), sizeof(uint64_t));
88
+ write(os, seed_hash);
89
+ write(os, this->theta_);
90
90
  if (this->get_num_retained() > 0) {
91
- const uint32_t num_entries = this->entries_.size();
92
- os.write(reinterpret_cast<const char*>(&num_entries), sizeof(num_entries));
91
+ const uint32_t num_entries = static_cast<uint32_t>(this->entries_.size());
92
+ write(os, num_entries);
93
93
  const uint32_t unused32 = 0;
94
- os.write(reinterpret_cast<const char*>(&unused32), sizeof(unused32));
94
+ write(os, unused32);
95
95
  for (const auto& it: this->entries_) {
96
- os.write(reinterpret_cast<const char*>(&it.first), sizeof(uint64_t));
96
+ write(os, it.first);
97
97
  }
98
98
  for (const auto& it: this->entries_) {
99
- os.write(reinterpret_cast<const char*>(it.second.data()), it.second.size() * sizeof(double));
99
+ write(os, it.second.data(), it.second.size() * sizeof(double));
100
100
  }
101
101
  }
102
102
  }
@@ -110,30 +110,29 @@ auto compact_array_of_doubles_sketch_alloc<A>::serialize(unsigned header_size_by
110
110
  vector_bytes bytes(size, 0, this->entries_.get_allocator());
111
111
  uint8_t* ptr = bytes.data() + header_size_bytes;
112
112
 
113
- ptr += copy_to_mem(&preamble_longs, ptr, sizeof(preamble_longs));
113
+ ptr += copy_to_mem(preamble_longs, ptr);
114
114
  const uint8_t serial_version = SERIAL_VERSION;
115
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
115
+ ptr += copy_to_mem(serial_version, ptr);
116
116
  const uint8_t family = SKETCH_FAMILY;
117
- ptr += copy_to_mem(&family, ptr, sizeof(family));
117
+ ptr += copy_to_mem(family, ptr);
118
118
  const uint8_t type = SKETCH_TYPE;
119
- ptr += copy_to_mem(&type, ptr, sizeof(type));
119
+ ptr += copy_to_mem(type, ptr);
120
120
  const uint8_t flags_byte(
121
121
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
122
122
  (this->get_num_retained() ? 1 << flags::HAS_ENTRIES : 0) |
123
123
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
124
124
  );
125
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
126
- ptr += copy_to_mem(&num_values_, ptr, sizeof(num_values_));
125
+ ptr += copy_to_mem(flags_byte, ptr);
126
+ ptr += copy_to_mem(num_values_, ptr);
127
127
  const uint16_t seed_hash = this->get_seed_hash();
128
- ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
129
- ptr += copy_to_mem(&(this->theta_), ptr, sizeof(uint64_t));
128
+ ptr += copy_to_mem(seed_hash, ptr);
129
+ ptr += copy_to_mem((this->theta_), ptr);
130
130
  if (this->get_num_retained() > 0) {
131
- const uint32_t num_entries = this->entries_.size();
132
- ptr += copy_to_mem(&num_entries, ptr, sizeof(num_entries));
133
- const uint32_t unused32 = 0;
134
- ptr += copy_to_mem(&unused32, ptr, sizeof(unused32));
131
+ const uint32_t num_entries = static_cast<uint32_t>(this->entries_.size());
132
+ ptr += copy_to_mem(num_entries, ptr);
133
+ ptr += sizeof(uint32_t); // unused
135
134
  for (const auto& it: this->entries_) {
136
- ptr += copy_to_mem(&it.first, ptr, sizeof(uint64_t));
135
+ ptr += copy_to_mem(it.first, ptr);
137
136
  }
138
137
  for (const auto& it: this->entries_) {
139
138
  ptr += copy_to_mem(it.second.data(), ptr, it.second.size() * sizeof(double));
@@ -144,40 +143,30 @@ auto compact_array_of_doubles_sketch_alloc<A>::serialize(unsigned header_size_by
144
143
 
145
144
  template<typename A>
146
145
  compact_array_of_doubles_sketch_alloc<A> compact_array_of_doubles_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
147
- uint8_t preamble_longs;
148
- is.read(reinterpret_cast<char*>(&preamble_longs), sizeof(preamble_longs));
149
- uint8_t serial_version;
150
- is.read(reinterpret_cast<char*>(&serial_version), sizeof(serial_version));
151
- uint8_t family;
152
- is.read(reinterpret_cast<char*>(&family), sizeof(family));
153
- uint8_t type;
154
- is.read(reinterpret_cast<char*>(&type), sizeof(type));
155
- uint8_t flags_byte;
156
- is.read(reinterpret_cast<char*>(&flags_byte), sizeof(flags_byte));
157
- uint8_t num_values;
158
- is.read(reinterpret_cast<char*>(&num_values), sizeof(num_values));
159
- uint16_t seed_hash;
160
- is.read(reinterpret_cast<char*>(&seed_hash), sizeof(seed_hash));
146
+ read<uint8_t>(is); // unused
147
+ const auto serial_version = read<uint8_t>(is);
148
+ const auto family = read<uint8_t>(is);
149
+ const auto type = read<uint8_t>(is);
150
+ const auto flags_byte = read<uint8_t>(is);
151
+ const auto num_values = read<uint8_t>(is);
152
+ const auto seed_hash = read<uint16_t>(is);
161
153
  checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
162
154
  checker<true>::check_sketch_family(family, SKETCH_FAMILY);
163
155
  checker<true>::check_sketch_type(type, SKETCH_TYPE);
164
156
  const bool has_entries = flags_byte & (1 << flags::HAS_ENTRIES);
165
157
  if (has_entries) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
166
158
 
167
- uint64_t theta;
168
- is.read(reinterpret_cast<char*>(&theta), sizeof(theta));
159
+ const auto theta = read<uint64_t>(is);
169
160
  std::vector<Entry, AllocEntry> entries(allocator);
170
161
  if (has_entries) {
171
- uint32_t num_entries;
172
- is.read(reinterpret_cast<char*>(&num_entries), sizeof(num_entries));
173
- uint32_t unused32;
174
- is.read(reinterpret_cast<char*>(&unused32), sizeof(unused32));
162
+ const auto num_entries = read<uint32_t>(is);
163
+ read<uint32_t>(is); // unused
175
164
  entries.reserve(num_entries);
176
165
  std::vector<uint64_t, AllocU64> keys(num_entries, 0, allocator);
177
- is.read(reinterpret_cast<char*>(keys.data()), num_entries * sizeof(uint64_t));
166
+ read(is, keys.data(), num_entries * sizeof(uint64_t));
178
167
  for (size_t i = 0; i < num_entries; ++i) {
179
168
  aod<A> summary(num_values, allocator);
180
- is.read(reinterpret_cast<char*>(summary.data()), num_values * sizeof(double));
169
+ read(is, summary.data(), num_values * sizeof(double));
181
170
  entries.push_back(Entry(keys[i], std::move(summary)));
182
171
  }
183
172
  }
@@ -191,20 +180,19 @@ template<typename A>
191
180
  compact_array_of_doubles_sketch_alloc<A> compact_array_of_doubles_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
192
181
  ensure_minimum_memory(size, 16);
193
182
  const char* ptr = static_cast<const char*>(bytes);
194
- uint8_t preamble_longs;
195
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
183
+ ptr += sizeof(uint8_t); // unused
196
184
  uint8_t serial_version;
197
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
185
+ ptr += copy_from_mem(ptr, serial_version);
198
186
  uint8_t family;
199
- ptr += copy_from_mem(ptr, &family, sizeof(family));
187
+ ptr += copy_from_mem(ptr, family);
200
188
  uint8_t type;
201
- ptr += copy_from_mem(ptr, &type, sizeof(type));
189
+ ptr += copy_from_mem(ptr, type);
202
190
  uint8_t flags_byte;
203
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
191
+ ptr += copy_from_mem(ptr, flags_byte);
204
192
  uint8_t num_values;
205
- ptr += copy_from_mem(ptr, &num_values, sizeof(num_values));
193
+ ptr += copy_from_mem(ptr, num_values);
206
194
  uint16_t seed_hash;
207
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
195
+ ptr += copy_from_mem(ptr, seed_hash);
208
196
  checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
209
197
  checker<true>::check_sketch_family(family, SKETCH_FAMILY);
210
198
  checker<true>::check_sketch_type(type, SKETCH_TYPE);
@@ -212,14 +200,13 @@ compact_array_of_doubles_sketch_alloc<A> compact_array_of_doubles_sketch_alloc<A
212
200
  if (has_entries) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
213
201
 
214
202
  uint64_t theta;
215
- ptr += copy_from_mem(ptr, &theta, sizeof(theta));
203
+ ptr += copy_from_mem(ptr, theta);
216
204
  std::vector<Entry, AllocEntry> entries(allocator);
217
205
  if (has_entries) {
218
206
  ensure_minimum_memory(size, 24);
219
207
  uint32_t num_entries;
220
- ptr += copy_from_mem(ptr, &num_entries, sizeof(num_entries));
221
- uint32_t unused32;
222
- ptr += copy_from_mem(ptr, &unused32, sizeof(unused32));
208
+ ptr += copy_from_mem(ptr, num_entries);
209
+ ptr += sizeof(uint32_t); // unused
223
210
  ensure_minimum_memory(size, 24 + (sizeof(uint64_t) + sizeof(double) * num_values) * num_entries);
224
211
  entries.reserve(num_entries);
225
212
  std::vector<uint64_t, AllocU64> keys(num_entries, 0, allocator);