datasketches 0.1.2 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (160) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +12 -13
  4. data/ext/datasketches/ext.cpp +1 -1
  5. data/ext/datasketches/ext.h +4 -0
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/fi_wrapper.cpp +6 -8
  8. data/ext/datasketches/hll_wrapper.cpp +13 -14
  9. data/ext/datasketches/kll_wrapper.cpp +28 -76
  10. data/ext/datasketches/theta_wrapper.cpp +27 -41
  11. data/ext/datasketches/vo_wrapper.cpp +4 -6
  12. data/lib/datasketches/version.rb +1 -1
  13. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  14. data/vendor/datasketches-cpp/README.md +4 -4
  15. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
  16. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
  17. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
  18. data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
  19. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
  20. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
  21. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
  22. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
  23. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
  24. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
  25. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
  26. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
  27. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
  28. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
  29. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
  30. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
  31. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
  32. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
  33. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
  34. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
  35. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
  36. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
  37. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
  38. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  39. data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
  40. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
  41. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
  43. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
  44. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
  45. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
  46. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
  47. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
  48. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
  49. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
  50. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
  51. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
  52. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
  53. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
  54. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
  55. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
  56. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
  57. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
  58. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
  59. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
  60. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
  62. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
  63. data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
  64. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
  65. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
  66. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
  67. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
  68. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
  70. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
  71. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
  72. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
  74. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
  75. data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
  76. data/vendor/datasketches-cpp/python/README.md +6 -3
  77. data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
  78. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
  79. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
  80. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
  81. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
  82. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
  83. data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
  84. data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
  85. data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
  86. data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
  87. data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
  88. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
  89. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
  90. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
  91. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
  92. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
  93. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
  94. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
  95. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  96. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  97. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  98. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  99. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  100. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
  101. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
  102. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
  103. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
  104. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
  105. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
  106. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
  107. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
  108. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
  109. data/vendor/datasketches-cpp/setup.py +5 -3
  110. data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
  111. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
  112. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
  114. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
  115. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
  116. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
  117. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
  119. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
  120. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
  121. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
  122. data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
  123. data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
  124. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
  125. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
  126. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
  127. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
  128. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
  129. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
  130. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
  131. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
  132. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
  133. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
  134. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  135. data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
  136. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
  137. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
  138. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
  139. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
  140. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
  141. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
  142. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
  143. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
  144. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
  145. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
  146. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
  147. metadata +43 -34
  148. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
  149. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
  150. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
  151. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
  152. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
  153. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
  154. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
  155. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
  156. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  157. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
  160. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -51,7 +51,7 @@ class var_opt_union {
51
51
  public:
52
52
  static const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
53
53
 
54
- explicit var_opt_union(uint32_t max_k);
54
+ explicit var_opt_union(uint32_t max_k, const A& allocator = A());
55
55
  var_opt_union(const var_opt_union& other);
56
56
  var_opt_union(var_opt_union&& other) noexcept;
57
57
 
@@ -119,16 +119,16 @@ public:
119
119
  * @param is input stream
120
120
  * @return an instance of a union
121
121
  */
122
- static var_opt_union deserialize(std::istream& is);
122
+ static var_opt_union deserialize(std::istream& is, const A& allocator = A());
123
123
 
124
124
  /**
125
125
  * NOTE: This method may be deprecated in a future version.
126
- * This method deserializes a skeuniontch from a given array of bytes.
126
+ * This method deserializes a union from a given array of bytes.
127
127
  * @param bytes pointer to the array of bytes
128
128
  * @param size the size of the array
129
129
  * @return an instance of a union
130
130
  */
131
- static var_opt_union deserialize(const void* bytes, size_t size);
131
+ static var_opt_union deserialize(const void* bytes, size_t size, const A& allocator = A());
132
132
 
133
133
  /**
134
134
  * Prints a summary of the union as a string.
@@ -236,4 +236,4 @@ private:
236
236
 
237
237
  #include "var_opt_union_impl.hpp"
238
238
 
239
- #endif // _VAR_OPT_UNION_HPP_
239
+ #endif // _VAR_OPT_UNION_HPP_
@@ -28,12 +28,12 @@
28
28
  namespace datasketches {
29
29
 
30
30
  template<typename T, typename S, typename A>
31
- var_opt_union<T,S,A>::var_opt_union(uint32_t max_k) :
31
+ var_opt_union<T,S,A>::var_opt_union(uint32_t max_k, const A& allocator) :
32
32
  n_(0),
33
33
  outer_tau_numer_(0),
34
34
  outer_tau_denom_(0.0),
35
35
  max_k_(max_k),
36
- gadget_(max_k, var_opt_sketch<T,S,A>::DEFAULT_RESIZE_FACTOR, true)
36
+ gadget_(max_k, var_opt_sketch<T,S,A>::DEFAULT_RESIZE_FACTOR, true, allocator)
37
37
  {}
38
38
 
39
39
  template<typename T, typename S, typename A>
@@ -128,7 +128,7 @@ var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(var_opt_union&& other) {
128
128
  */
129
129
 
130
130
  template<typename T, typename S, typename A>
131
- var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is) {
131
+ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A& allocator) {
132
132
  uint8_t preamble_longs;
133
133
  is.read((char*)&preamble_longs, sizeof(preamble_longs));
134
134
  uint8_t serial_version;
@@ -163,7 +163,7 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is) {
163
163
  uint64_t outer_tau_denom;
164
164
  is.read((char*)&outer_tau_denom, sizeof(outer_tau_denom));
165
165
 
166
- var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is);
166
+ var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is, allocator);
167
167
 
168
168
  if (!is.good())
169
169
  throw std::runtime_error("error reading from std::istream");
@@ -172,7 +172,7 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is) {
172
172
  }
173
173
 
174
174
  template<typename T, typename S, typename A>
175
- var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t size) {
175
+ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t size, const A& allocator) {
176
176
  ensure_minimum_memory(size, 8);
177
177
  const char* ptr = static_cast<const char*>(bytes);
178
178
  uint8_t preamble_longs;
@@ -207,7 +207,7 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
207
207
  ptr += copy_from_mem(ptr, &outer_tau_denom, sizeof(outer_tau_denom));
208
208
 
209
209
  const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
210
- var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size);
210
+ var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size, allocator);
211
211
 
212
212
  return var_opt_union<T,S,A>(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
213
213
  }
@@ -255,7 +255,7 @@ void var_opt_union<T,S,A>::serialize(std::ostream& os) const {
255
255
  template<typename T, typename S, typename A>
256
256
  std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header_size_bytes) const {
257
257
  const size_t size = header_size_bytes + get_serialized_size_bytes();
258
- std::vector<uint8_t, AllocU8<A>> bytes(size);
258
+ std::vector<uint8_t, AllocU8<A>> bytes(size, 0, gadget_.allocator_);
259
259
  uint8_t* ptr = bytes.data() + header_size_bytes;
260
260
 
261
261
  const bool empty = n_ == 0;
@@ -40,4 +40,5 @@ target_sources(sampling_test
40
40
  PRIVATE
41
41
  var_opt_sketch_test.cpp
42
42
  var_opt_union_test.cpp
43
+ var_opt_allocation_test.cpp
43
44
  )
@@ -0,0 +1,96 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <var_opt_sketch.hpp>
21
+ #include <var_opt_union.hpp>
22
+ #include <test_type.hpp>
23
+ #include <test_allocator.hpp>
24
+
25
+ #include <catch.hpp>
26
+
27
+ #include <sstream>
28
+
29
+ namespace datasketches {
30
+
31
+ using var_opt_test_sketch = var_opt_sketch<test_type, test_type_serde, test_allocator<test_type>>;
32
+ using var_opt_test_union = var_opt_union<test_type, test_type_serde, test_allocator<test_type>>;
33
+ using alloc = test_allocator<test_type>;
34
+
35
+ TEST_CASE("varopt allocation test", "[var_opt_sketch]") {
36
+ test_allocator_total_bytes = 0;
37
+ test_allocator_net_allocations = 0;
38
+ {
39
+ var_opt_test_sketch sk1(10, var_opt_test_sketch::DEFAULT_RESIZE_FACTOR, 0);
40
+ for (int i = 0; i < 100; ++i) sk1.update(i);
41
+ auto bytes1 = sk1.serialize();
42
+ auto sk2 = var_opt_test_sketch::deserialize(bytes1.data(), bytes1.size(), 0);
43
+
44
+ std::stringstream ss;
45
+ sk1.serialize(ss);
46
+ auto sk3 = var_opt_test_sketch::deserialize(ss, alloc(0));
47
+
48
+ var_opt_test_union u1(10, 0);
49
+ u1.update(sk1);
50
+ u1.update(sk2);
51
+ u1.update(sk3);
52
+
53
+ auto bytes2 = u1.serialize();
54
+ auto u2 = var_opt_test_union::deserialize(bytes2.data(), bytes2.size(), 0);
55
+ }
56
+ REQUIRE(test_allocator_total_bytes == 0);
57
+ REQUIRE(test_allocator_net_allocations == 0);
58
+ }
59
+
60
+ TEST_CASE( "varopt union: move", "[var_opt_union][test_type]") {
61
+ test_allocator_total_bytes = 0;
62
+ test_allocator_net_allocations = 0;
63
+ {
64
+ uint32_t n = 20;
65
+ uint32_t k = 5;
66
+ var_opt_test_union u(k, 0);
67
+ var_opt_test_sketch sk1(k, var_opt_test_sketch::DEFAULT_RESIZE_FACTOR, 0);
68
+ var_opt_test_sketch sk2(k, var_opt_test_sketch::DEFAULT_RESIZE_FACTOR, 0);
69
+
70
+ // move updates
71
+ for (int i = 0; i < (int) n; ++i) {
72
+ sk1.update(i);
73
+ sk2.update(-i);
74
+ }
75
+ REQUIRE(sk1.get_n() == n);
76
+ REQUIRE(sk2.get_n() == n);
77
+
78
+ // move unions
79
+ u.update(std::move(sk2));
80
+ u.update(std::move(sk1));
81
+ REQUIRE(u.get_result().get_n() == 2 * n);
82
+
83
+ // move constructor
84
+ var_opt_test_union u2(std::move(u));
85
+ REQUIRE(u2.get_result().get_n() == 2 * n);
86
+
87
+ // move assignment
88
+ var_opt_test_union u3(k, 0);
89
+ u3 = std::move(u2);
90
+ REQUIRE(u3.get_result().get_n() == 2 * n);
91
+ }
92
+ REQUIRE(test_allocator_total_bytes == 0);
93
+ REQUIRE(test_allocator_net_allocations == 0);
94
+ }
95
+
96
+ }
@@ -18,7 +18,6 @@
18
18
  */
19
19
 
20
20
  #include <var_opt_union.hpp>
21
- #include "test_type.hpp"
22
21
 
23
22
  #include <catch.hpp>
24
23
 
@@ -325,34 +324,4 @@ TEST_CASE("varopt union: deserialize from java", "[var_opt_union]") {
325
324
  REQUIRE(result.get_k() < 128);
326
325
  }
327
326
 
328
- TEST_CASE( "varopt union: move", "[var_opt_union][test_type]") {
329
- uint32_t n = 20;
330
- uint32_t k = 5;
331
- var_opt_union<test_type> u(k);
332
- var_opt_sketch<test_type> sk1(k);
333
- var_opt_sketch<test_type> sk2(k);
334
-
335
- // move udpates
336
- for (int i = 0; i < (int) n; ++i) {
337
- sk1.update(i);
338
- sk2.update(-i);
339
- }
340
- REQUIRE(sk1.get_n() == n);
341
- REQUIRE(sk2.get_n() == n);
342
-
343
- // move unions
344
- u.update(std::move(sk2));
345
- u.update(std::move(sk1));
346
- REQUIRE(u.get_result().get_n() == 2 * n);
347
-
348
- // move constructor
349
- var_opt_union<test_type> u2(std::move(u));
350
- REQUIRE(u2.get_result().get_n() == 2 * n);
351
-
352
- // move assignment
353
- var_opt_union<test_type> u3(k);
354
- u3 = std::move(u2);
355
- REQUIRE(u3.get_result().get_n() == 2 * n);
356
- }
357
-
358
327
  }
@@ -49,6 +49,8 @@ class CMakeBuild(build_ext):
49
49
  os.path.dirname(self.get_ext_fullpath(ext.name)))
50
50
  cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir]
51
51
  cmake_args += ['-DWITH_PYTHON=True']
52
+ # ensure we use a consistent python version
53
+ cmake_args += ['-DPYTHON_EXECUTABLE=' + sys.executable]
52
54
  cfg = 'Debug' if self.debug else 'Release'
53
55
  build_args = ['--config', cfg]
54
56
 
@@ -77,10 +79,10 @@ class CMakeBuild(build_ext):
77
79
 
78
80
  setup(
79
81
  name='datasketches',
80
- version='2.2.0-SNAPSHOT',
81
- author='Datasketches Developers',
82
+ version='3.0.0',
83
+ author='Apache DataSketches Developers',
82
84
  author_email='dev@datasketches.apache.org',
83
- description='A wrapper for the C++ Datasketches library',
85
+ description='A wrapper for the C++ Apache DataSketches library',
84
86
  license='Apache License 2.0',
85
87
  url='http://datasketches.apache.org',
86
88
  long_description=open('python/README.md').read(),
@@ -33,9 +33,21 @@ target_link_libraries(theta INTERFACE common)
33
33
  target_compile_features(theta INTERFACE cxx_std_11)
34
34
 
35
35
  set(theta_HEADERS "")
36
- list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/theta_union.hpp;include/theta_intersection.hpp")
37
- list(APPEND theta_HEADERS "include/theta_a_not_b.hpp;include/theta_sketch_impl.hpp")
38
- list(APPEND theta_HEADERS "include/theta_union_impl.hpp;include/theta_intersection_impl.hpp;include/theta_a_not_b_impl.hpp")
36
+ list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/theta_sketch_impl.hpp")
37
+ list(APPEND theta_HEADERS "include/theta_union.hpp;include/theta_union_impl.hpp")
38
+ list(APPEND theta_HEADERS "include/theta_intersection.hpp;include/theta_intersection_impl.hpp")
39
+ list(APPEND theta_HEADERS "include/theta_a_not_b.hpp;include/theta_a_not_b_impl.hpp")
40
+ list(APPEND theta_HEADERS "include/theta_jaccard_similarity.hpp")
41
+ list(APPEND theta_HEADERS "include/theta_comparators.hpp")
42
+ list(APPEND theta_HEADERS "include/theta_constants.hpp")
43
+ list(APPEND theta_HEADERS "include/theta_helpers.hpp")
44
+ list(APPEND theta_HEADERS "include/theta_update_sketch_base.hpp;include/theta_update_sketch_base_impl.hpp")
45
+ list(APPEND theta_HEADERS "include/theta_union_base.hpp;include/theta_union_base_impl.hpp")
46
+ list(APPEND theta_HEADERS "include/theta_intersection_base.hpp;include/theta_intersection_base_impl.hpp")
47
+ list(APPEND theta_HEADERS "include/theta_set_difference_base.hpp;include/theta_set_difference_base_impl.hpp")
48
+ list(APPEND theta_HEADERS "include/theta_jaccard_similarity_base.hpp")
49
+ list(APPEND theta_HEADERS "include/bounds_on_ratios_in_sampled_sets.hpp")
50
+ list(APPEND theta_HEADERS "include/bounds_on_ratios_in_theta_sketched_sets.hpp")
39
51
 
40
52
  install(TARGETS theta
41
53
  EXPORT ${PROJECT_NAME}
@@ -54,4 +66,19 @@ target_sources(theta
54
66
  ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_impl.hpp
55
67
  ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_impl.hpp
56
68
  ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b_impl.hpp
69
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity.hpp
70
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_comparators.hpp
71
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_constants.hpp
72
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_helpers.hpp
73
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base.hpp
74
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base_impl.hpp
75
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base.hpp
76
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base_impl.hpp
77
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base.hpp
78
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base_impl.hpp
79
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base.hpp
80
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base_impl.hpp
81
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity_base.hpp
82
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_sampled_sets.hpp
83
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_theta_sketched_sets.hpp
57
84
  )
@@ -21,8 +21,9 @@
21
21
  #define BOUNDS_ON_RATIOS_IN_SAMPLED_SETS_HPP_
22
22
 
23
23
  #include <cstdint>
24
+ #include <string>
24
25
 
25
- #include <bounds_binomial_proportions.hpp>
26
+ #include "bounds_binomial_proportions.hpp"
26
27
 
27
28
  namespace datasketches {
28
29
 
@@ -23,7 +23,7 @@
23
23
  #include <cstdint>
24
24
  #include <stdexcept>
25
25
 
26
- #include <bounds_on_ratios_in_sampled_sets.hpp>
26
+ #include "bounds_on_ratios_in_sampled_sets.hpp"
27
27
 
28
28
  namespace datasketches {
29
29
 
@@ -20,51 +20,34 @@
20
20
  #ifndef THETA_A_NOT_B_HPP_
21
21
  #define THETA_A_NOT_B_HPP_
22
22
 
23
- #include <memory>
24
- #include <functional>
25
- #include <climits>
26
-
27
23
  #include "theta_sketch.hpp"
28
- #include "common_defs.hpp"
24
+ #include "theta_set_difference_base.hpp"
29
25
 
30
26
  namespace datasketches {
31
27
 
32
- /*
33
- * author Alexander Saydakov
34
- * author Lee Rhodes
35
- * author Kevin Lang
36
- */
37
-
38
- template<typename A>
28
+ template<typename Allocator = std::allocator<uint64_t>>
39
29
  class theta_a_not_b_alloc {
40
30
  public:
41
- /**
42
- * Creates an instance of the a-not-b operation (set difference) with a given has seed.
43
- * @param seed hash seed
44
- */
45
- explicit theta_a_not_b_alloc(uint64_t seed = DEFAULT_SEED);
31
+ using Entry = uint64_t;
32
+ using ExtractKey = trivial_extract_key;
33
+ using CompactSketch = compact_theta_sketch_alloc<Allocator>;
34
+ using State = theta_set_difference_base<Entry, ExtractKey, CompactSketch, Allocator>;
35
+
36
+ explicit theta_a_not_b_alloc(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
46
37
 
47
38
  /**
48
39
  * Computes the a-not-b set operation given two sketches.
49
40
  * @return the result of a-not-b
50
41
  */
51
- compact_theta_sketch_alloc<A> compute(const theta_sketch_alloc<A>& a, const theta_sketch_alloc<A>& b, bool ordered = true) const;
42
+ template<typename FwdSketch, typename Sketch>
43
+ CompactSketch compute(FwdSketch&& a, const Sketch& b, bool ordered = true) const;
52
44
 
53
45
  private:
54
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
55
- uint16_t seed_hash_;
56
-
57
- class less_than {
58
- public:
59
- explicit less_than(uint64_t value): value(value) {}
60
- bool operator()(uint64_t value) const { return value < this->value; }
61
- private:
62
- uint64_t value;
63
- };
46
+ State state_;
64
47
  };
65
48
 
66
49
  // alias with default allocator for convenience
67
- typedef theta_a_not_b_alloc<std::allocator<void>> theta_a_not_b;
50
+ using theta_a_not_b = theta_a_not_b_alloc<std::allocator<uint64_t>>;
68
51
 
69
52
  } /* namespace datasketches */
70
53
 
@@ -26,56 +26,15 @@
26
26
 
27
27
  namespace datasketches {
28
28
 
29
- /*
30
- * author Alexander Saydakov
31
- * author Lee Rhodes
32
- * author Kevin Lang
33
- */
34
-
35
29
  template<typename A>
36
- theta_a_not_b_alloc<A>::theta_a_not_b_alloc(uint64_t seed):
37
- seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
30
+ theta_a_not_b_alloc<A>::theta_a_not_b_alloc(uint64_t seed, const A& allocator):
31
+ state_(seed, allocator)
38
32
  {}
39
33
 
40
34
  template<typename A>
41
- compact_theta_sketch_alloc<A> theta_a_not_b_alloc<A>::compute(const theta_sketch_alloc<A>& a, const theta_sketch_alloc<A>& b, bool ordered) const {
42
- if (a.is_empty() || a.get_num_retained() == 0 || b.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
43
- if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
44
- if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
45
-
46
- const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
47
- vector_u64<A> keys;
48
- bool is_empty = a.is_empty();
49
-
50
- if (b.get_num_retained() == 0) {
51
- std::copy_if(a.begin(), a.end(), std::back_inserter(keys), less_than(theta));
52
- } else {
53
- if (a.is_ordered() && b.is_ordered()) { // sort-based
54
- std::set_difference(a.begin(), a.end(), b.begin(), b.end(), conditional_back_inserter(keys, less_than(theta)));
55
- } else { // hash-based
56
- const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
57
- vector_u64<A> b_hash_table(1 << lg_size, 0);
58
- for (auto key: b) {
59
- if (key < theta) {
60
- update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size);
61
- } else if (b.is_ordered()) {
62
- break; // early stop
63
- }
64
- }
65
-
66
- // scan A lookup B
67
- for (auto key: a) {
68
- if (key < theta) {
69
- if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys.push_back(key);
70
- } else if (a.is_ordered()) {
71
- break; // early stop
72
- }
73
- }
74
- }
75
- }
76
- if (keys.empty() && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
77
- if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
78
- return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered);
35
+ template<typename FwdSketch, typename Sketch>
36
+ auto theta_a_not_b_alloc<A>::compute(FwdSketch&& a, const Sketch& b, bool ordered) const -> CompactSketch {
37
+ return state_.compute(std::forward<FwdSketch>(a), b, ordered);
79
38
  }
80
39
 
81
40
  } /* namespace datasketches */