datasketches 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +12 -13
  4. data/ext/datasketches/ext.cpp +1 -1
  5. data/ext/datasketches/ext.h +4 -0
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/fi_wrapper.cpp +6 -8
  8. data/ext/datasketches/hll_wrapper.cpp +13 -14
  9. data/ext/datasketches/kll_wrapper.cpp +28 -76
  10. data/ext/datasketches/theta_wrapper.cpp +27 -41
  11. data/ext/datasketches/vo_wrapper.cpp +4 -6
  12. data/lib/datasketches/version.rb +1 -1
  13. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  14. data/vendor/datasketches-cpp/README.md +4 -4
  15. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
  16. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
  17. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
  18. data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
  19. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
  20. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
  21. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
  22. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
  23. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
  24. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
  25. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
  26. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
  27. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
  28. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
  29. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
  30. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
  31. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
  32. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
  33. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
  34. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
  35. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
  36. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
  37. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
  38. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  39. data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
  40. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
  41. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
  43. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
  44. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
  45. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
  46. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
  47. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
  48. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
  49. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
  50. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
  51. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
  52. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
  53. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
  54. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
  55. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
  56. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
  57. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
  58. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
  59. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
  60. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
  62. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
  63. data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
  64. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
  65. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
  66. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
  67. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
  68. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
  70. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
  71. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
  72. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
  74. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
  75. data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
  76. data/vendor/datasketches-cpp/python/README.md +6 -3
  77. data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
  78. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
  79. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
  80. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
  81. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
  82. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
  83. data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
  84. data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
  85. data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
  86. data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
  87. data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
  88. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
  89. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
  90. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
  91. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
  92. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
  93. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
  94. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
  95. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  96. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  97. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  98. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  99. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  100. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
  101. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
  102. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
  103. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
  104. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
  105. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
  106. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
  107. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
  108. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
  109. data/vendor/datasketches-cpp/setup.py +5 -3
  110. data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
  111. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
  112. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
  114. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
  115. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
  116. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
  117. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
  119. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
  120. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
  121. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
  122. data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
  123. data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
  124. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
  125. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
  126. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
  127. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
  128. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
  129. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
  130. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
  131. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
  132. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
  133. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
  134. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  135. data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
  136. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
  137. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
  138. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
  139. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
  140. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
  141. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
  142. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
  143. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
  144. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
  145. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
  146. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
  147. metadata +43 -34
  148. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
  149. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
  150. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
  151. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
  152. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
  153. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
  154. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
  155. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
  156. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  157. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
  160. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -51,7 +51,7 @@ class var_opt_union {
51
51
  public:
52
52
  static const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
53
53
 
54
- explicit var_opt_union(uint32_t max_k);
54
+ explicit var_opt_union(uint32_t max_k, const A& allocator = A());
55
55
  var_opt_union(const var_opt_union& other);
56
56
  var_opt_union(var_opt_union&& other) noexcept;
57
57
 
@@ -119,16 +119,16 @@ public:
119
119
  * @param is input stream
120
120
  * @return an instance of a union
121
121
  */
122
- static var_opt_union deserialize(std::istream& is);
122
+ static var_opt_union deserialize(std::istream& is, const A& allocator = A());
123
123
 
124
124
  /**
125
125
  * NOTE: This method may be deprecated in a future version.
126
- * This method deserializes a skeuniontch from a given array of bytes.
126
+ * This method deserializes a union from a given array of bytes.
127
127
  * @param bytes pointer to the array of bytes
128
128
  * @param size the size of the array
129
129
  * @return an instance of a union
130
130
  */
131
- static var_opt_union deserialize(const void* bytes, size_t size);
131
+ static var_opt_union deserialize(const void* bytes, size_t size, const A& allocator = A());
132
132
 
133
133
  /**
134
134
  * Prints a summary of the union as a string.
@@ -236,4 +236,4 @@ private:
236
236
 
237
237
  #include "var_opt_union_impl.hpp"
238
238
 
239
- #endif // _VAR_OPT_UNION_HPP_
239
+ #endif // _VAR_OPT_UNION_HPP_
@@ -28,12 +28,12 @@
28
28
  namespace datasketches {
29
29
 
30
30
  template<typename T, typename S, typename A>
31
- var_opt_union<T,S,A>::var_opt_union(uint32_t max_k) :
31
+ var_opt_union<T,S,A>::var_opt_union(uint32_t max_k, const A& allocator) :
32
32
  n_(0),
33
33
  outer_tau_numer_(0),
34
34
  outer_tau_denom_(0.0),
35
35
  max_k_(max_k),
36
- gadget_(max_k, var_opt_sketch<T,S,A>::DEFAULT_RESIZE_FACTOR, true)
36
+ gadget_(max_k, var_opt_sketch<T,S,A>::DEFAULT_RESIZE_FACTOR, true, allocator)
37
37
  {}
38
38
 
39
39
  template<typename T, typename S, typename A>
@@ -128,7 +128,7 @@ var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(var_opt_union&& other) {
128
128
  */
129
129
 
130
130
  template<typename T, typename S, typename A>
131
- var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is) {
131
+ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A& allocator) {
132
132
  uint8_t preamble_longs;
133
133
  is.read((char*)&preamble_longs, sizeof(preamble_longs));
134
134
  uint8_t serial_version;
@@ -163,7 +163,7 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is) {
163
163
  uint64_t outer_tau_denom;
164
164
  is.read((char*)&outer_tau_denom, sizeof(outer_tau_denom));
165
165
 
166
- var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is);
166
+ var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is, allocator);
167
167
 
168
168
  if (!is.good())
169
169
  throw std::runtime_error("error reading from std::istream");
@@ -172,7 +172,7 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is) {
172
172
  }
173
173
 
174
174
  template<typename T, typename S, typename A>
175
- var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t size) {
175
+ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t size, const A& allocator) {
176
176
  ensure_minimum_memory(size, 8);
177
177
  const char* ptr = static_cast<const char*>(bytes);
178
178
  uint8_t preamble_longs;
@@ -207,7 +207,7 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
207
207
  ptr += copy_from_mem(ptr, &outer_tau_denom, sizeof(outer_tau_denom));
208
208
 
209
209
  const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
210
- var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size);
210
+ var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size, allocator);
211
211
 
212
212
  return var_opt_union<T,S,A>(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
213
213
  }
@@ -255,7 +255,7 @@ void var_opt_union<T,S,A>::serialize(std::ostream& os) const {
255
255
  template<typename T, typename S, typename A>
256
256
  std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header_size_bytes) const {
257
257
  const size_t size = header_size_bytes + get_serialized_size_bytes();
258
- std::vector<uint8_t, AllocU8<A>> bytes(size);
258
+ std::vector<uint8_t, AllocU8<A>> bytes(size, 0, gadget_.allocator_);
259
259
  uint8_t* ptr = bytes.data() + header_size_bytes;
260
260
 
261
261
  const bool empty = n_ == 0;
@@ -40,4 +40,5 @@ target_sources(sampling_test
40
40
  PRIVATE
41
41
  var_opt_sketch_test.cpp
42
42
  var_opt_union_test.cpp
43
+ var_opt_allocation_test.cpp
43
44
  )
@@ -0,0 +1,96 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <var_opt_sketch.hpp>
21
+ #include <var_opt_union.hpp>
22
+ #include <test_type.hpp>
23
+ #include <test_allocator.hpp>
24
+
25
+ #include <catch.hpp>
26
+
27
+ #include <sstream>
28
+
29
+ namespace datasketches {
30
+
31
+ using var_opt_test_sketch = var_opt_sketch<test_type, test_type_serde, test_allocator<test_type>>;
32
+ using var_opt_test_union = var_opt_union<test_type, test_type_serde, test_allocator<test_type>>;
33
+ using alloc = test_allocator<test_type>;
34
+
35
+ TEST_CASE("varopt allocation test", "[var_opt_sketch]") {
36
+ test_allocator_total_bytes = 0;
37
+ test_allocator_net_allocations = 0;
38
+ {
39
+ var_opt_test_sketch sk1(10, var_opt_test_sketch::DEFAULT_RESIZE_FACTOR, 0);
40
+ for (int i = 0; i < 100; ++i) sk1.update(i);
41
+ auto bytes1 = sk1.serialize();
42
+ auto sk2 = var_opt_test_sketch::deserialize(bytes1.data(), bytes1.size(), 0);
43
+
44
+ std::stringstream ss;
45
+ sk1.serialize(ss);
46
+ auto sk3 = var_opt_test_sketch::deserialize(ss, alloc(0));
47
+
48
+ var_opt_test_union u1(10, 0);
49
+ u1.update(sk1);
50
+ u1.update(sk2);
51
+ u1.update(sk3);
52
+
53
+ auto bytes2 = u1.serialize();
54
+ auto u2 = var_opt_test_union::deserialize(bytes2.data(), bytes2.size(), 0);
55
+ }
56
+ REQUIRE(test_allocator_total_bytes == 0);
57
+ REQUIRE(test_allocator_net_allocations == 0);
58
+ }
59
+
60
+ TEST_CASE( "varopt union: move", "[var_opt_union][test_type]") {
61
+ test_allocator_total_bytes = 0;
62
+ test_allocator_net_allocations = 0;
63
+ {
64
+ uint32_t n = 20;
65
+ uint32_t k = 5;
66
+ var_opt_test_union u(k, 0);
67
+ var_opt_test_sketch sk1(k, var_opt_test_sketch::DEFAULT_RESIZE_FACTOR, 0);
68
+ var_opt_test_sketch sk2(k, var_opt_test_sketch::DEFAULT_RESIZE_FACTOR, 0);
69
+
70
+ // move updates
71
+ for (int i = 0; i < (int) n; ++i) {
72
+ sk1.update(i);
73
+ sk2.update(-i);
74
+ }
75
+ REQUIRE(sk1.get_n() == n);
76
+ REQUIRE(sk2.get_n() == n);
77
+
78
+ // move unions
79
+ u.update(std::move(sk2));
80
+ u.update(std::move(sk1));
81
+ REQUIRE(u.get_result().get_n() == 2 * n);
82
+
83
+ // move constructor
84
+ var_opt_test_union u2(std::move(u));
85
+ REQUIRE(u2.get_result().get_n() == 2 * n);
86
+
87
+ // move assignment
88
+ var_opt_test_union u3(k, 0);
89
+ u3 = std::move(u2);
90
+ REQUIRE(u3.get_result().get_n() == 2 * n);
91
+ }
92
+ REQUIRE(test_allocator_total_bytes == 0);
93
+ REQUIRE(test_allocator_net_allocations == 0);
94
+ }
95
+
96
+ }
@@ -18,7 +18,6 @@
18
18
  */
19
19
 
20
20
  #include <var_opt_union.hpp>
21
- #include "test_type.hpp"
22
21
 
23
22
  #include <catch.hpp>
24
23
 
@@ -325,34 +324,4 @@ TEST_CASE("varopt union: deserialize from java", "[var_opt_union]") {
325
324
  REQUIRE(result.get_k() < 128);
326
325
  }
327
326
 
328
- TEST_CASE( "varopt union: move", "[var_opt_union][test_type]") {
329
- uint32_t n = 20;
330
- uint32_t k = 5;
331
- var_opt_union<test_type> u(k);
332
- var_opt_sketch<test_type> sk1(k);
333
- var_opt_sketch<test_type> sk2(k);
334
-
335
- // move udpates
336
- for (int i = 0; i < (int) n; ++i) {
337
- sk1.update(i);
338
- sk2.update(-i);
339
- }
340
- REQUIRE(sk1.get_n() == n);
341
- REQUIRE(sk2.get_n() == n);
342
-
343
- // move unions
344
- u.update(std::move(sk2));
345
- u.update(std::move(sk1));
346
- REQUIRE(u.get_result().get_n() == 2 * n);
347
-
348
- // move constructor
349
- var_opt_union<test_type> u2(std::move(u));
350
- REQUIRE(u2.get_result().get_n() == 2 * n);
351
-
352
- // move assignment
353
- var_opt_union<test_type> u3(k);
354
- u3 = std::move(u2);
355
- REQUIRE(u3.get_result().get_n() == 2 * n);
356
- }
357
-
358
327
  }
@@ -49,6 +49,8 @@ class CMakeBuild(build_ext):
49
49
  os.path.dirname(self.get_ext_fullpath(ext.name)))
50
50
  cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir]
51
51
  cmake_args += ['-DWITH_PYTHON=True']
52
+ # ensure we use a consistent python version
53
+ cmake_args += ['-DPYTHON_EXECUTABLE=' + sys.executable]
52
54
  cfg = 'Debug' if self.debug else 'Release'
53
55
  build_args = ['--config', cfg]
54
56
 
@@ -77,10 +79,10 @@ class CMakeBuild(build_ext):
77
79
 
78
80
  setup(
79
81
  name='datasketches',
80
- version='2.2.0-SNAPSHOT',
81
- author='Datasketches Developers',
82
+ version='3.0.0',
83
+ author='Apache DataSketches Developers',
82
84
  author_email='dev@datasketches.apache.org',
83
- description='A wrapper for the C++ Datasketches library',
85
+ description='A wrapper for the C++ Apache DataSketches library',
84
86
  license='Apache License 2.0',
85
87
  url='http://datasketches.apache.org',
86
88
  long_description=open('python/README.md').read(),
@@ -33,9 +33,21 @@ target_link_libraries(theta INTERFACE common)
33
33
  target_compile_features(theta INTERFACE cxx_std_11)
34
34
 
35
35
  set(theta_HEADERS "")
36
- list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/theta_union.hpp;include/theta_intersection.hpp")
37
- list(APPEND theta_HEADERS "include/theta_a_not_b.hpp;include/theta_sketch_impl.hpp")
38
- list(APPEND theta_HEADERS "include/theta_union_impl.hpp;include/theta_intersection_impl.hpp;include/theta_a_not_b_impl.hpp")
36
+ list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/theta_sketch_impl.hpp")
37
+ list(APPEND theta_HEADERS "include/theta_union.hpp;include/theta_union_impl.hpp")
38
+ list(APPEND theta_HEADERS "include/theta_intersection.hpp;include/theta_intersection_impl.hpp")
39
+ list(APPEND theta_HEADERS "include/theta_a_not_b.hpp;include/theta_a_not_b_impl.hpp")
40
+ list(APPEND theta_HEADERS "include/theta_jaccard_similarity.hpp")
41
+ list(APPEND theta_HEADERS "include/theta_comparators.hpp")
42
+ list(APPEND theta_HEADERS "include/theta_constants.hpp")
43
+ list(APPEND theta_HEADERS "include/theta_helpers.hpp")
44
+ list(APPEND theta_HEADERS "include/theta_update_sketch_base.hpp;include/theta_update_sketch_base_impl.hpp")
45
+ list(APPEND theta_HEADERS "include/theta_union_base.hpp;include/theta_union_base_impl.hpp")
46
+ list(APPEND theta_HEADERS "include/theta_intersection_base.hpp;include/theta_intersection_base_impl.hpp")
47
+ list(APPEND theta_HEADERS "include/theta_set_difference_base.hpp;include/theta_set_difference_base_impl.hpp")
48
+ list(APPEND theta_HEADERS "include/theta_jaccard_similarity_base.hpp")
49
+ list(APPEND theta_HEADERS "include/bounds_on_ratios_in_sampled_sets.hpp")
50
+ list(APPEND theta_HEADERS "include/bounds_on_ratios_in_theta_sketched_sets.hpp")
39
51
 
40
52
  install(TARGETS theta
41
53
  EXPORT ${PROJECT_NAME}
@@ -54,4 +66,19 @@ target_sources(theta
54
66
  ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_impl.hpp
55
67
  ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_impl.hpp
56
68
  ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b_impl.hpp
69
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity.hpp
70
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_comparators.hpp
71
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_constants.hpp
72
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_helpers.hpp
73
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base.hpp
74
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base_impl.hpp
75
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base.hpp
76
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base_impl.hpp
77
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base.hpp
78
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base_impl.hpp
79
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base.hpp
80
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base_impl.hpp
81
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity_base.hpp
82
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_sampled_sets.hpp
83
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_theta_sketched_sets.hpp
57
84
  )
@@ -21,8 +21,9 @@
21
21
  #define BOUNDS_ON_RATIOS_IN_SAMPLED_SETS_HPP_
22
22
 
23
23
  #include <cstdint>
24
+ #include <string>
24
25
 
25
- #include <bounds_binomial_proportions.hpp>
26
+ #include "bounds_binomial_proportions.hpp"
26
27
 
27
28
  namespace datasketches {
28
29
 
@@ -23,7 +23,7 @@
23
23
  #include <cstdint>
24
24
  #include <stdexcept>
25
25
 
26
- #include <bounds_on_ratios_in_sampled_sets.hpp>
26
+ #include "bounds_on_ratios_in_sampled_sets.hpp"
27
27
 
28
28
  namespace datasketches {
29
29
 
@@ -20,51 +20,34 @@
20
20
  #ifndef THETA_A_NOT_B_HPP_
21
21
  #define THETA_A_NOT_B_HPP_
22
22
 
23
- #include <memory>
24
- #include <functional>
25
- #include <climits>
26
-
27
23
  #include "theta_sketch.hpp"
28
- #include "common_defs.hpp"
24
+ #include "theta_set_difference_base.hpp"
29
25
 
30
26
  namespace datasketches {
31
27
 
32
- /*
33
- * author Alexander Saydakov
34
- * author Lee Rhodes
35
- * author Kevin Lang
36
- */
37
-
38
- template<typename A>
28
+ template<typename Allocator = std::allocator<uint64_t>>
39
29
  class theta_a_not_b_alloc {
40
30
  public:
41
- /**
42
- * Creates an instance of the a-not-b operation (set difference) with a given has seed.
43
- * @param seed hash seed
44
- */
45
- explicit theta_a_not_b_alloc(uint64_t seed = DEFAULT_SEED);
31
+ using Entry = uint64_t;
32
+ using ExtractKey = trivial_extract_key;
33
+ using CompactSketch = compact_theta_sketch_alloc<Allocator>;
34
+ using State = theta_set_difference_base<Entry, ExtractKey, CompactSketch, Allocator>;
35
+
36
+ explicit theta_a_not_b_alloc(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
46
37
 
47
38
  /**
48
39
  * Computes the a-not-b set operation given two sketches.
49
40
  * @return the result of a-not-b
50
41
  */
51
- compact_theta_sketch_alloc<A> compute(const theta_sketch_alloc<A>& a, const theta_sketch_alloc<A>& b, bool ordered = true) const;
42
+ template<typename FwdSketch, typename Sketch>
43
+ CompactSketch compute(FwdSketch&& a, const Sketch& b, bool ordered = true) const;
52
44
 
53
45
  private:
54
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
55
- uint16_t seed_hash_;
56
-
57
- class less_than {
58
- public:
59
- explicit less_than(uint64_t value): value(value) {}
60
- bool operator()(uint64_t value) const { return value < this->value; }
61
- private:
62
- uint64_t value;
63
- };
46
+ State state_;
64
47
  };
65
48
 
66
49
  // alias with default allocator for convenience
67
- typedef theta_a_not_b_alloc<std::allocator<void>> theta_a_not_b;
50
+ using theta_a_not_b = theta_a_not_b_alloc<std::allocator<uint64_t>>;
68
51
 
69
52
  } /* namespace datasketches */
70
53
 
@@ -26,56 +26,15 @@
26
26
 
27
27
  namespace datasketches {
28
28
 
29
- /*
30
- * author Alexander Saydakov
31
- * author Lee Rhodes
32
- * author Kevin Lang
33
- */
34
-
35
29
  template<typename A>
36
- theta_a_not_b_alloc<A>::theta_a_not_b_alloc(uint64_t seed):
37
- seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
30
+ theta_a_not_b_alloc<A>::theta_a_not_b_alloc(uint64_t seed, const A& allocator):
31
+ state_(seed, allocator)
38
32
  {}
39
33
 
40
34
  template<typename A>
41
- compact_theta_sketch_alloc<A> theta_a_not_b_alloc<A>::compute(const theta_sketch_alloc<A>& a, const theta_sketch_alloc<A>& b, bool ordered) const {
42
- if (a.is_empty() || a.get_num_retained() == 0 || b.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
43
- if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
44
- if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
45
-
46
- const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
47
- vector_u64<A> keys;
48
- bool is_empty = a.is_empty();
49
-
50
- if (b.get_num_retained() == 0) {
51
- std::copy_if(a.begin(), a.end(), std::back_inserter(keys), less_than(theta));
52
- } else {
53
- if (a.is_ordered() && b.is_ordered()) { // sort-based
54
- std::set_difference(a.begin(), a.end(), b.begin(), b.end(), conditional_back_inserter(keys, less_than(theta)));
55
- } else { // hash-based
56
- const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
57
- vector_u64<A> b_hash_table(1 << lg_size, 0);
58
- for (auto key: b) {
59
- if (key < theta) {
60
- update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size);
61
- } else if (b.is_ordered()) {
62
- break; // early stop
63
- }
64
- }
65
-
66
- // scan A lookup B
67
- for (auto key: a) {
68
- if (key < theta) {
69
- if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys.push_back(key);
70
- } else if (a.is_ordered()) {
71
- break; // early stop
72
- }
73
- }
74
- }
75
- }
76
- if (keys.empty() && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
77
- if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
78
- return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered);
35
+ template<typename FwdSketch, typename Sketch>
36
+ auto theta_a_not_b_alloc<A>::compute(FwdSketch&& a, const Sketch& b, bool ordered) const -> CompactSketch {
37
+ return state_.compute(std::forward<FwdSketch>(a), b, ordered);
79
38
  }
80
39
 
81
40
  } /* namespace datasketches */