datasketches 0.2.0 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (170) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/README.md +7 -7
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/theta_wrapper.cpp +20 -4
  8. data/lib/datasketches/version.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
  10. data/vendor/datasketches-cpp/LICENSE +40 -3
  11. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  12. data/vendor/datasketches-cpp/NOTICE +1 -1
  13. data/vendor/datasketches-cpp/README.md +76 -9
  14. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  17. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  18. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  19. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  20. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  21. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  22. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  23. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  24. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
  25. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  26. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  27. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
  28. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
  29. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
  31. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  32. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  33. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  34. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  35. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  36. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  37. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  38. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  39. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  40. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
  41. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  42. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  43. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  44. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  45. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  46. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  47. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  48. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  49. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  50. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  51. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  52. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  53. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  54. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  55. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  56. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  57. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  58. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  59. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  60. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  61. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  62. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
  63. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  64. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  65. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  66. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  67. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  68. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  69. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  70. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  71. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  72. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  73. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  74. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  75. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  76. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  77. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  78. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  79. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  80. data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
  81. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  82. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  83. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  84. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  85. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
  86. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
  87. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  88. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  89. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  90. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
  91. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  92. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  93. data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
  94. data/vendor/datasketches-cpp/python/README.md +50 -50
  95. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  96. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  97. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  98. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
  99. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
  100. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  101. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  102. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  103. data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
  104. data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
  105. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  106. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  107. data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
  108. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  109. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  110. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  111. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  112. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
  113. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  114. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
  117. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
  118. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  119. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  120. data/vendor/datasketches-cpp/setup.py +10 -7
  121. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  122. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  124. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
  125. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
  126. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  127. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  128. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
  129. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  130. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  131. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
  132. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
  133. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
  134. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
  135. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
  137. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
  139. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
  140. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  141. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  142. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  144. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  145. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  146. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  147. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  148. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  149. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
  150. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
  151. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
  152. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  153. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  154. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
  155. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  157. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
  158. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
  159. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  160. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  161. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
  162. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  163. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  164. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  165. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  166. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
  167. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
  168. metadata +18 -7
  169. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  170. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -49,6 +49,21 @@ public:
49
49
  }
50
50
  };
51
51
 
52
+ template<bool dummy>
53
+ class theta_build_helper{
54
+ public:
55
+ // consistent way of initializing theta from p
56
+ // avoids multiplication if p == 1 since it might not yield MAX_THETA exactly
57
+ static uint64_t starting_theta_from_p(float p) {
58
+ if (p < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p);
59
+ return theta_constants::MAX_THETA;
60
+ }
61
+
62
+ static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
63
+ return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
64
+ }
65
+ };
66
+
52
67
  } /* namespace datasketches */
53
68
 
54
69
  #endif
@@ -33,14 +33,19 @@ public:
33
33
  using Sketch = theta_sketch_alloc<Allocator>;
34
34
  using CompactSketch = compact_theta_sketch_alloc<Allocator>;
35
35
 
36
- struct pass_through_policy {
37
- uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
36
+ struct nop_policy {
37
+ void operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
38
38
  unused(incoming_entry);
39
- return internal_entry;
39
+ unused(internal_entry);
40
40
  }
41
41
  };
42
- using State = theta_intersection_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
42
+ using State = theta_intersection_base<Entry, ExtractKey, nop_policy, Sketch, CompactSketch, Allocator>;
43
43
 
44
+ /**
45
+ * Constructor
46
+ * @param seed for the hash function that was used to create the sketch
47
+ * @param allocator to use for allocating and deallocating memory
48
+ */
44
49
  explicit theta_intersection_alloc(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
45
50
 
46
51
  /**
@@ -29,7 +29,7 @@ template<typename EN, typename EK, typename P, typename S, typename CS, typename
29
29
  theta_intersection_base<EN, EK, P, S, CS, A>::theta_intersection_base(uint64_t seed, const P& policy, const A& allocator):
30
30
  policy_(policy),
31
31
  is_valid_(false),
32
- table_(0, 0, resize_factor::X1, theta_constants::MAX_THETA, seed, allocator, false)
32
+ table_(0, 0, resize_factor::X1, 1, theta_constants::MAX_THETA, seed, allocator, false)
33
33
  {}
34
34
 
35
35
  template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
@@ -38,17 +38,17 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
38
38
  if (table_.is_empty_) return;
39
39
  if (!sketch.is_empty() && sketch.get_seed_hash() != compute_seed_hash(table_.seed_)) throw std::invalid_argument("seed hash mismatch");
40
40
  table_.is_empty_ |= sketch.is_empty();
41
- table_.theta_ = std::min(table_.theta_, sketch.get_theta64());
41
+ table_.theta_ = table_.is_empty_ ? theta_constants::MAX_THETA : std::min(table_.theta_, sketch.get_theta64());
42
42
  if (is_valid_ && table_.num_entries_ == 0) return;
43
43
  if (sketch.get_num_retained() == 0) {
44
44
  is_valid_ = true;
45
- table_ = hash_table(0, 0, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
45
+ table_ = hash_table(0, 0, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
46
46
  return;
47
47
  }
48
48
  if (!is_valid_) { // first update, copy or move incoming sketch
49
49
  is_valid_ = true;
50
50
  const uint8_t lg_size = lg_size_from_count(sketch.get_num_retained(), theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
51
- table_ = hash_table(lg_size, lg_size, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
51
+ table_ = hash_table(lg_size, lg_size, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
52
52
  for (auto& entry: sketch) {
53
53
  auto result = table_.find(EK()(entry));
54
54
  if (result.second) {
@@ -83,11 +83,11 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
83
83
  throw std::invalid_argument(" fewer keys than expected, possibly corrupted input sketch");
84
84
  }
85
85
  if (match_count == 0) {
86
- table_ = hash_table(0, 0, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
86
+ table_ = hash_table(0, 0, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
87
87
  if (table_.theta_ == theta_constants::MAX_THETA) table_.is_empty_ = true;
88
88
  } else {
89
89
  const uint8_t lg_size = lg_size_from_count(match_count, theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
90
- table_ = hash_table(lg_size, lg_size, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
90
+ table_ = hash_table(lg_size, lg_size, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
91
91
  for (uint32_t i = 0; i < match_count; i++) {
92
92
  auto result = table_.find(EK()(matched_entries[i]));
93
93
  table_.insert(result.first, std::move(matched_entries[i]));
@@ -24,7 +24,7 @@ namespace datasketches {
24
24
 
25
25
  template<typename A>
26
26
  theta_intersection_alloc<A>::theta_intersection_alloc(uint64_t seed, const A& allocator):
27
- state_(seed, pass_through_policy(), allocator)
27
+ state_(seed, nop_policy(), allocator)
28
28
  {}
29
29
 
30
30
  template<typename A>
@@ -46,20 +46,21 @@ public:
46
46
  *
47
47
  * @param sketch_a given sketch A
48
48
  * @param sketch_b given sketch B
49
+ * @param seed for the hash function that was used to create the sketches
49
50
  * @return a double array {LowerBound, Estimate, UpperBound} of the Jaccard index.
50
51
  * The Upper and Lower bounds are for a confidence interval of 95.4% or +/- 2 standard deviations.
51
52
  */
52
53
  template<typename SketchA, typename SketchB>
53
- static std::array<double, 3> jaccard(const SketchA& sketch_a, const SketchB& sketch_b) {
54
+ static std::array<double, 3> jaccard(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
54
55
  if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return {1, 1, 1};
55
56
  if (sketch_a.is_empty() && sketch_b.is_empty()) return {1, 1, 1};
56
57
  if (sketch_a.is_empty() || sketch_b.is_empty()) return {0, 0, 0};
57
58
 
58
- auto union_ab = compute_union(sketch_a, sketch_b);
59
+ auto union_ab = compute_union(sketch_a, sketch_b, seed);
59
60
  if (identical_sets(sketch_a, sketch_b, union_ab)) return {1, 1, 1};
60
61
 
61
62
  // intersection
62
- Intersection i;
63
+ Intersection i(seed);
63
64
  i.update(sketch_a);
64
65
  i.update(sketch_b);
65
66
  i.update(union_ab); // ensures that intersection is a subset of the union
@@ -76,15 +77,16 @@ public:
76
77
  * Returns true if the two given sketches are equivalent.
77
78
  * @param sketch_a the given sketch A
78
79
  * @param sketch_b the given sketch B
80
+ * @param seed for the hash function that was used to create the sketches
79
81
  * @return true if the two given sketches are exactly equal
80
82
  */
81
83
  template<typename SketchA, typename SketchB>
82
- static bool exactly_equal(const SketchA& sketch_a, const SketchB& sketch_b) {
84
+ static bool exactly_equal(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
83
85
  if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return true;
84
86
  if (sketch_a.is_empty() && sketch_b.is_empty()) return true;
85
87
  if (sketch_a.is_empty() || sketch_b.is_empty()) return false;
86
88
 
87
- auto union_ab = compute_union(sketch_a, sketch_b);
89
+ auto union_ab = compute_union(sketch_a, sketch_b, seed);
88
90
  if (identical_sets(sketch_a, sketch_b, union_ab)) return true;
89
91
  return false;
90
92
  }
@@ -99,12 +101,13 @@ public:
99
101
  * @param actual the sketch to be tested
100
102
  * @param expected the reference sketch that is considered to be correct
101
103
  * @param threshold a real value between zero and one
104
+ * @param seed for the hash function that was used to create the sketches
102
105
  * @return true if the similarity of the two sketches is greater than the given threshold
103
106
  * with at least 97.7% confidence
104
107
  */
105
108
  template<typename SketchA, typename SketchB>
106
- static bool similarity_test(const SketchA& actual, const SketchB& expected, double threshold) {
107
- auto jc = jaccard(actual, expected);
109
+ static bool similarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
110
+ auto jc = jaccard(actual, expected, seed);
108
111
  return jc[0] >= threshold;
109
112
  }
110
113
 
@@ -118,23 +121,24 @@ public:
118
121
  * @param actual the sketch to be tested
119
122
  * @param expected the reference sketch that is considered to be correct
120
123
  * @param threshold a real value between zero and one
124
+ * @param seed for the hash function that was used to create the sketches
121
125
  * @return true if the dissimilarity of the two sketches is greater than the given threshold
122
126
  * with at least 97.7% confidence
123
127
  */
124
128
  template<typename SketchA, typename SketchB>
125
- static bool dissimilarity_test(const SketchA& actual, const SketchB& expected, double threshold) {
126
- auto jc = jaccard(actual, expected);
129
+ static bool dissimilarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
130
+ auto jc = jaccard(actual, expected, seed);
127
131
  return jc[2] <= threshold;
128
132
  }
129
133
 
130
134
  private:
131
135
 
132
136
  template<typename SketchA, typename SketchB>
133
- static typename Union::CompactSketch compute_union(const SketchA& sketch_a, const SketchB& sketch_b) {
134
- const unsigned count_a = sketch_a.get_num_retained();
135
- const unsigned count_b = sketch_b.get_num_retained();
136
- const unsigned lg_k = std::min(std::max(log2(ceiling_power_of_2(count_a + count_b)), theta_constants::MIN_LG_K), theta_constants::MAX_LG_K);
137
- auto u = typename Union::builder().set_lg_k(lg_k).build();
137
+ static typename Union::CompactSketch compute_union(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed) {
138
+ const auto count_a = sketch_a.get_num_retained();
139
+ const auto count_b = sketch_b.get_num_retained();
140
+ const uint8_t lg_k = std::min(std::max(log2(ceiling_power_of_2(count_a + count_b)), theta_constants::MIN_LG_K), theta_constants::MAX_LG_K);
141
+ auto u = typename Union::builder().set_lg_k(lg_k).set_seed(seed).build();
138
142
  u.update(sketch_a);
139
143
  u.update(sketch_b);
140
144
  return u.get_result(false);
@@ -36,7 +36,7 @@ seed_hash_(compute_seed_hash(seed))
36
36
  template<typename EN, typename EK, typename CS, typename A>
37
37
  template<typename FwdSketch, typename Sketch>
38
38
  CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch& b, bool ordered) const {
39
- if (a.is_empty() || a.get_num_retained() == 0 || b.is_empty()) return CS(a, ordered);
39
+ if (a.is_empty() || (a.get_num_retained() > 0 && b.is_empty())) return CS(a, ordered);
40
40
  if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
41
41
  if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
42
42
 
@@ -53,7 +53,7 @@ CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch
53
53
  conditional_back_inserter(entries, key_less_than<uint64_t, EN, EK>(theta)), comparator());
54
54
  } else { // hash-based
55
55
  const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), hash_table::REBUILD_THRESHOLD);
56
- hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 0, 0, allocator_); // theta and seed are not used here
56
+ hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 1, 0, 0, allocator_); // theta and seed are not used here
57
57
  for (const auto& entry: b) {
58
58
  const uint64_t hash = EK()(entry);
59
59
  if (hash < theta) {
@@ -25,14 +25,10 @@
25
25
  namespace datasketches {
26
26
 
27
27
  template<typename Allocator = std::allocator<uint64_t>>
28
- class theta_sketch_alloc {
28
+ class base_theta_sketch_alloc {
29
29
  public:
30
- using Entry = uint64_t;
31
- using ExtractKey = trivial_extract_key;
32
- using iterator = theta_iterator<Entry, ExtractKey>;
33
- using const_iterator = theta_const_iterator<Entry, ExtractKey>;
34
30
 
35
- virtual ~theta_sketch_alloc() = default;
31
+ virtual ~base_theta_sketch_alloc() = default;
36
32
 
37
33
  /**
38
34
  * @return allocator
@@ -104,6 +100,21 @@ public:
104
100
  */
105
101
  virtual string<Allocator> to_string(bool print_items = false) const;
106
102
 
103
+ protected:
104
+ virtual void print_specifics(std::ostringstream& os) const = 0;
105
+ virtual void print_items(std::ostringstream& os) const = 0;
106
+ };
107
+
108
+ template<typename Allocator = std::allocator<uint64_t>>
109
+ class theta_sketch_alloc: public base_theta_sketch_alloc<Allocator> {
110
+ public:
111
+ using Entry = uint64_t;
112
+ using ExtractKey = trivial_extract_key;
113
+ using iterator = theta_iterator<Entry, ExtractKey>;
114
+ using const_iterator = theta_const_iterator<Entry, ExtractKey>;
115
+
116
+ virtual ~theta_sketch_alloc() = default;
117
+
107
118
  /**
108
119
  * Iterator over hash values in this sketch.
109
120
  * @return begin iterator
@@ -131,8 +142,7 @@ public:
131
142
  virtual const_iterator end() const = 0;
132
143
 
133
144
  protected:
134
- using ostrstream = std::basic_ostringstream<char, std::char_traits<char>, AllocChar<Allocator>>;
135
- virtual void print_specifics(ostrstream& os) const = 0;
145
+ virtual void print_items(std::ostringstream& os) const;
136
146
  };
137
147
 
138
148
  // forward declaration
@@ -269,6 +279,11 @@ public:
269
279
  */
270
280
  void trim();
271
281
 
282
+ /**
283
+ * Reset the sketch to the initial empty state
284
+ */
285
+ void reset();
286
+
272
287
  /**
273
288
  * Converts this sketch to a compact sketch (ordered or unordered).
274
289
  * @param ordered optional flag to specify if ordered sketch should be produced
@@ -285,11 +300,10 @@ private:
285
300
  theta_table table_;
286
301
 
287
302
  // for builder
288
- update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta,
289
- uint64_t seed, const Allocator& allocator);
303
+ update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p,
304
+ uint64_t theta, uint64_t seed, const Allocator& allocator);
290
305
 
291
- using ostrstream = typename Base::ostrstream;
292
- virtual void print_specifics(ostrstream& os) const;
306
+ virtual void print_specifics(std::ostringstream& os) const;
293
307
  };
294
308
 
295
309
  // compact sketch
@@ -311,7 +325,8 @@ public:
311
325
  // - as a result of a set operation
312
326
  // - by deserializing a previously serialized compact sketch
313
327
 
314
- compact_theta_sketch_alloc(const Base& other, bool ordered);
328
+ template<typename Other>
329
+ compact_theta_sketch_alloc(const Other& other, bool ordered);
315
330
  compact_theta_sketch_alloc(const compact_theta_sketch_alloc&) = default;
316
331
  compact_theta_sketch_alloc(compact_theta_sketch_alloc&&) noexcept = default;
317
332
  virtual ~compact_theta_sketch_alloc() = default;
@@ -376,8 +391,7 @@ private:
376
391
  uint64_t theta_;
377
392
  std::vector<uint64_t, Allocator> entries_;
378
393
 
379
- using ostrstream = typename Base::ostrstream;
380
- virtual void print_specifics(ostrstream& os) const;
394
+ virtual void print_specifics(std::ostringstream& os) const;
381
395
  };
382
396
 
383
397
  template<typename Allocator>
@@ -387,10 +401,54 @@ public:
387
401
  update_theta_sketch_alloc build() const;
388
402
  };
389
403
 
404
+ // Wraps a buffer containing a serialized compact sketch so that it can be used in a set operation while avoiding some of the cost of deserialization.
405
+ // It does not take ownership of the buffer.
406
+
407
+ template<typename Allocator = std::allocator<uint64_t>>
408
+ class wrapped_compact_theta_sketch_alloc : public base_theta_sketch_alloc<Allocator> {
409
+ public:
410
+ using const_iterator = const uint64_t*;
411
+
412
+ Allocator get_allocator() const;
413
+ bool is_empty() const;
414
+ bool is_ordered() const;
415
+ uint64_t get_theta64() const;
416
+ uint32_t get_num_retained() const;
417
+ uint16_t get_seed_hash() const;
418
+
419
+ const_iterator begin() const;
420
+ const_iterator end() const;
421
+
422
+ /**
423
+ * This method wraps a serialized compact sketch as an array of bytes.
424
+ * @param bytes pointer to the array of bytes
425
+ * @param size the size of the array
426
+ * @param seed the seed for the hash function that was used to create the sketch
427
+ * @return an instance of the sketch
428
+ */
429
+ static const wrapped_compact_theta_sketch_alloc wrap(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, bool dump_on_error = false);
430
+
431
+ protected:
432
+ virtual void print_specifics(std::ostringstream& os) const;
433
+ virtual void print_items(std::ostringstream& os) const;
434
+
435
+ private:
436
+ bool is_empty_;
437
+ bool is_ordered_;
438
+ uint16_t seed_hash_;
439
+ uint32_t num_entries_;
440
+ uint64_t theta_;
441
+ const uint64_t* entries_;
442
+
443
+ wrapped_compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint32_t num_entries,
444
+ uint64_t theta, const uint64_t* entries);
445
+ };
446
+
390
447
  // aliases with default allocator for convenience
391
448
  using theta_sketch = theta_sketch_alloc<std::allocator<uint64_t>>;
392
449
  using update_theta_sketch = update_theta_sketch_alloc<std::allocator<uint64_t>>;
393
450
  using compact_theta_sketch = compact_theta_sketch_alloc<std::allocator<uint64_t>>;
451
+ using wrapped_compact_theta_sketch = wrapped_compact_theta_sketch_alloc<std::allocator<uint64_t>>;
394
452
 
395
453
  } /* namespace datasketches */
396
454