datasketches 0.2.0 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/README.md +7 -7
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/theta_wrapper.cpp +20 -4
  8. data/lib/datasketches/version.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
  10. data/vendor/datasketches-cpp/LICENSE +40 -3
  11. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  12. data/vendor/datasketches-cpp/NOTICE +1 -1
  13. data/vendor/datasketches-cpp/README.md +76 -9
  14. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  17. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  18. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  19. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  20. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  21. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  22. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  23. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  24. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
  25. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  26. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  27. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
  28. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
  29. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
  31. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  32. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  33. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  34. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  35. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  36. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  37. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  38. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  39. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  40. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
  41. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  42. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  43. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  44. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  45. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  46. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  47. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  48. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  49. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  50. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  51. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  52. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  53. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  54. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  55. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  56. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  57. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  58. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  59. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  60. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  61. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  62. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
  63. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  64. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  65. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  66. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  67. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  68. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  69. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  70. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  71. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  72. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  73. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  74. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  75. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  76. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  77. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  78. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  79. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  80. data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
  81. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  82. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  83. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  84. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  85. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
  86. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
  87. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  88. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  89. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  90. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
  91. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  92. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  93. data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
  94. data/vendor/datasketches-cpp/python/README.md +50 -50
  95. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  96. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  97. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  98. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
  99. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
  100. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  101. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  102. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  103. data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
  104. data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
  105. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  106. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  107. data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
  108. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  109. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  110. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  111. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  112. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
  113. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  114. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
  117. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
  118. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  119. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  120. data/vendor/datasketches-cpp/setup.py +10 -7
  121. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  122. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  124. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
  125. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
  126. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  127. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  128. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
  129. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  130. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  131. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
  132. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
  133. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
  134. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
  135. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
  137. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
  139. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
  140. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  141. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  142. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  144. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  145. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  146. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  147. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  148. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  149. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
  150. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
  151. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
  152. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  153. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  154. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
  155. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  157. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
  158. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
  159. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  160. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  161. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
  162. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  163. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  164. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  165. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  166. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
  167. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
  168. metadata +18 -7
  169. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  170. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -49,6 +49,21 @@ public:
49
49
  }
50
50
  };
51
51
 
52
+ template<bool dummy>
53
+ class theta_build_helper{
54
+ public:
55
+ // consistent way of initializing theta from p
56
+ // avoids multiplication if p == 1 since it might not yield MAX_THETA exactly
57
+ static uint64_t starting_theta_from_p(float p) {
58
+ if (p < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p);
59
+ return theta_constants::MAX_THETA;
60
+ }
61
+
62
+ static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
63
+ return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
64
+ }
65
+ };
66
+
52
67
  } /* namespace datasketches */
53
68
 
54
69
  #endif
@@ -33,14 +33,19 @@ public:
33
33
  using Sketch = theta_sketch_alloc<Allocator>;
34
34
  using CompactSketch = compact_theta_sketch_alloc<Allocator>;
35
35
 
36
- struct pass_through_policy {
37
- uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
36
+ struct nop_policy {
37
+ void operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
38
38
  unused(incoming_entry);
39
- return internal_entry;
39
+ unused(internal_entry);
40
40
  }
41
41
  };
42
- using State = theta_intersection_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
42
+ using State = theta_intersection_base<Entry, ExtractKey, nop_policy, Sketch, CompactSketch, Allocator>;
43
43
 
44
+ /*
45
+ * Constructor
46
+ * @param seed for the hash function that was used to create the sketch
47
+ * @param allocator to use for allocating and deallocating memory
48
+ */
44
49
  explicit theta_intersection_alloc(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
45
50
 
46
51
  /**
@@ -29,7 +29,7 @@ template<typename EN, typename EK, typename P, typename S, typename CS, typename
29
29
  theta_intersection_base<EN, EK, P, S, CS, A>::theta_intersection_base(uint64_t seed, const P& policy, const A& allocator):
30
30
  policy_(policy),
31
31
  is_valid_(false),
32
- table_(0, 0, resize_factor::X1, theta_constants::MAX_THETA, seed, allocator, false)
32
+ table_(0, 0, resize_factor::X1, 1, theta_constants::MAX_THETA, seed, allocator, false)
33
33
  {}
34
34
 
35
35
  template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
@@ -38,17 +38,17 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
38
38
  if (table_.is_empty_) return;
39
39
  if (!sketch.is_empty() && sketch.get_seed_hash() != compute_seed_hash(table_.seed_)) throw std::invalid_argument("seed hash mismatch");
40
40
  table_.is_empty_ |= sketch.is_empty();
41
- table_.theta_ = std::min(table_.theta_, sketch.get_theta64());
41
+ table_.theta_ = table_.is_empty_ ? theta_constants::MAX_THETA : std::min(table_.theta_, sketch.get_theta64());
42
42
  if (is_valid_ && table_.num_entries_ == 0) return;
43
43
  if (sketch.get_num_retained() == 0) {
44
44
  is_valid_ = true;
45
- table_ = hash_table(0, 0, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
45
+ table_ = hash_table(0, 0, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
46
46
  return;
47
47
  }
48
48
  if (!is_valid_) { // first update, copy or move incoming sketch
49
49
  is_valid_ = true;
50
50
  const uint8_t lg_size = lg_size_from_count(sketch.get_num_retained(), theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
51
- table_ = hash_table(lg_size, lg_size, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
51
+ table_ = hash_table(lg_size, lg_size, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
52
52
  for (auto& entry: sketch) {
53
53
  auto result = table_.find(EK()(entry));
54
54
  if (result.second) {
@@ -83,11 +83,11 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
83
83
  throw std::invalid_argument(" fewer keys than expected, possibly corrupted input sketch");
84
84
  }
85
85
  if (match_count == 0) {
86
- table_ = hash_table(0, 0, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
86
+ table_ = hash_table(0, 0, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
87
87
  if (table_.theta_ == theta_constants::MAX_THETA) table_.is_empty_ = true;
88
88
  } else {
89
89
  const uint8_t lg_size = lg_size_from_count(match_count, theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
90
- table_ = hash_table(lg_size, lg_size, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
90
+ table_ = hash_table(lg_size, lg_size, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
91
91
  for (uint32_t i = 0; i < match_count; i++) {
92
92
  auto result = table_.find(EK()(matched_entries[i]));
93
93
  table_.insert(result.first, std::move(matched_entries[i]));
@@ -24,7 +24,7 @@ namespace datasketches {
24
24
 
25
25
  template<typename A>
26
26
  theta_intersection_alloc<A>::theta_intersection_alloc(uint64_t seed, const A& allocator):
27
- state_(seed, pass_through_policy(), allocator)
27
+ state_(seed, nop_policy(), allocator)
28
28
  {}
29
29
 
30
30
  template<typename A>
@@ -46,20 +46,21 @@ public:
46
46
  *
47
47
  * @param sketch_a given sketch A
48
48
  * @param sketch_b given sketch B
49
+ * @param seed for the hash function that was used to create the sketch
49
50
  * @return a double array {LowerBound, Estimate, UpperBound} of the Jaccard index.
50
51
  * The Upper and Lower bounds are for a confidence interval of 95.4% or +/- 2 standard deviations.
51
52
  */
52
53
  template<typename SketchA, typename SketchB>
53
- static std::array<double, 3> jaccard(const SketchA& sketch_a, const SketchB& sketch_b) {
54
+ static std::array<double, 3> jaccard(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
54
55
  if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return {1, 1, 1};
55
56
  if (sketch_a.is_empty() && sketch_b.is_empty()) return {1, 1, 1};
56
57
  if (sketch_a.is_empty() || sketch_b.is_empty()) return {0, 0, 0};
57
58
 
58
- auto union_ab = compute_union(sketch_a, sketch_b);
59
+ auto union_ab = compute_union(sketch_a, sketch_b, seed);
59
60
  if (identical_sets(sketch_a, sketch_b, union_ab)) return {1, 1, 1};
60
61
 
61
62
  // intersection
62
- Intersection i;
63
+ Intersection i(seed);
63
64
  i.update(sketch_a);
64
65
  i.update(sketch_b);
65
66
  i.update(union_ab); // ensures that intersection is a subset of the union
@@ -76,15 +77,16 @@ public:
76
77
  * Returns true if the two given sketches are equivalent.
77
78
  * @param sketch_a the given sketch A
78
79
  * @param sketch_b the given sketch B
80
+ * @param seed for the hash function that was used to create the sketch
79
81
  * @return true if the two given sketches are exactly equal
80
82
  */
81
83
  template<typename SketchA, typename SketchB>
82
- static bool exactly_equal(const SketchA& sketch_a, const SketchB& sketch_b) {
84
+ static bool exactly_equal(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
83
85
  if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return true;
84
86
  if (sketch_a.is_empty() && sketch_b.is_empty()) return true;
85
87
  if (sketch_a.is_empty() || sketch_b.is_empty()) return false;
86
88
 
87
- auto union_ab = compute_union(sketch_a, sketch_b);
89
+ auto union_ab = compute_union(sketch_a, sketch_b, seed);
88
90
  if (identical_sets(sketch_a, sketch_b, union_ab)) return true;
89
91
  return false;
90
92
  }
@@ -99,12 +101,13 @@ public:
99
101
  * @param actual the sketch to be tested
100
102
  * @param expected the reference sketch that is considered to be correct
101
103
  * @param threshold a real value between zero and one
104
+ * @param seed for the hash function that was used to create the sketch
102
105
  * @return true if the similarity of the two sketches is greater than the given threshold
103
106
  * with at least 97.7% confidence
104
107
  */
105
108
  template<typename SketchA, typename SketchB>
106
- static bool similarity_test(const SketchA& actual, const SketchB& expected, double threshold) {
107
- auto jc = jaccard(actual, expected);
109
+ static bool similarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
110
+ auto jc = jaccard(actual, expected, seed);
108
111
  return jc[0] >= threshold;
109
112
  }
110
113
 
@@ -118,23 +121,24 @@ public:
118
121
  * @param actual the sketch to be tested
119
122
  * @param expected the reference sketch that is considered to be correct
120
123
  * @param threshold a real value between zero and one
124
+ * @param seed for the hash function that was used to create the sketch
121
125
  * @return true if the dissimilarity of the two sketches is greater than the given threshold
122
126
  * with at least 97.7% confidence
123
127
  */
124
128
  template<typename SketchA, typename SketchB>
125
- static bool dissimilarity_test(const SketchA& actual, const SketchB& expected, double threshold) {
126
- auto jc = jaccard(actual, expected);
129
+ static bool dissimilarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
130
+ auto jc = jaccard(actual, expected, seed);
127
131
  return jc[2] <= threshold;
128
132
  }
129
133
 
130
134
  private:
131
135
 
132
136
  template<typename SketchA, typename SketchB>
133
- static typename Union::CompactSketch compute_union(const SketchA& sketch_a, const SketchB& sketch_b) {
134
- const unsigned count_a = sketch_a.get_num_retained();
135
- const unsigned count_b = sketch_b.get_num_retained();
136
- const unsigned lg_k = std::min(std::max(log2(ceiling_power_of_2(count_a + count_b)), theta_constants::MIN_LG_K), theta_constants::MAX_LG_K);
137
- auto u = typename Union::builder().set_lg_k(lg_k).build();
137
+ static typename Union::CompactSketch compute_union(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed) {
138
+ const auto count_a = sketch_a.get_num_retained();
139
+ const auto count_b = sketch_b.get_num_retained();
140
+ const uint8_t lg_k = std::min(std::max(log2(ceiling_power_of_2(count_a + count_b)), theta_constants::MIN_LG_K), theta_constants::MAX_LG_K);
141
+ auto u = typename Union::builder().set_lg_k(lg_k).set_seed(seed).build();
138
142
  u.update(sketch_a);
139
143
  u.update(sketch_b);
140
144
  return u.get_result(false);
@@ -36,7 +36,7 @@ seed_hash_(compute_seed_hash(seed))
36
36
  template<typename EN, typename EK, typename CS, typename A>
37
37
  template<typename FwdSketch, typename Sketch>
38
38
  CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch& b, bool ordered) const {
39
- if (a.is_empty() || a.get_num_retained() == 0 || b.is_empty()) return CS(a, ordered);
39
+ if (a.is_empty() || (a.get_num_retained() > 0 && b.is_empty())) return CS(a, ordered);
40
40
  if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
41
41
  if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
42
42
 
@@ -53,7 +53,7 @@ CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch
53
53
  conditional_back_inserter(entries, key_less_than<uint64_t, EN, EK>(theta)), comparator());
54
54
  } else { // hash-based
55
55
  const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), hash_table::REBUILD_THRESHOLD);
56
- hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 0, 0, allocator_); // theta and seed are not used here
56
+ hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 1, 0, 0, allocator_); // theta and seed are not used here
57
57
  for (const auto& entry: b) {
58
58
  const uint64_t hash = EK()(entry);
59
59
  if (hash < theta) {
@@ -25,14 +25,10 @@
25
25
  namespace datasketches {
26
26
 
27
27
  template<typename Allocator = std::allocator<uint64_t>>
28
- class theta_sketch_alloc {
28
+ class base_theta_sketch_alloc {
29
29
  public:
30
- using Entry = uint64_t;
31
- using ExtractKey = trivial_extract_key;
32
- using iterator = theta_iterator<Entry, ExtractKey>;
33
- using const_iterator = theta_const_iterator<Entry, ExtractKey>;
34
30
 
35
- virtual ~theta_sketch_alloc() = default;
31
+ virtual ~base_theta_sketch_alloc() = default;
36
32
 
37
33
  /**
38
34
  * @return allocator
@@ -104,6 +100,21 @@ public:
104
100
  */
105
101
  virtual string<Allocator> to_string(bool print_items = false) const;
106
102
 
103
+ protected:
104
+ virtual void print_specifics(std::ostringstream& os) const = 0;
105
+ virtual void print_items(std::ostringstream& os) const = 0;
106
+ };
107
+
108
+ template<typename Allocator = std::allocator<uint64_t>>
109
+ class theta_sketch_alloc: public base_theta_sketch_alloc<Allocator> {
110
+ public:
111
+ using Entry = uint64_t;
112
+ using ExtractKey = trivial_extract_key;
113
+ using iterator = theta_iterator<Entry, ExtractKey>;
114
+ using const_iterator = theta_const_iterator<Entry, ExtractKey>;
115
+
116
+ virtual ~theta_sketch_alloc() = default;
117
+
107
118
  /**
108
119
  * Iterator over hash values in this sketch.
109
120
  * @return begin iterator
@@ -131,8 +142,7 @@ public:
131
142
  virtual const_iterator end() const = 0;
132
143
 
133
144
  protected:
134
- using ostrstream = std::basic_ostringstream<char, std::char_traits<char>, AllocChar<Allocator>>;
135
- virtual void print_specifics(ostrstream& os) const = 0;
145
+ virtual void print_items(std::ostringstream& os) const;
136
146
  };
137
147
 
138
148
  // forward declaration
@@ -269,6 +279,11 @@ public:
269
279
  */
270
280
  void trim();
271
281
 
282
+ /**
283
+ * Reset the sketch to the initial empty state
284
+ */
285
+ void reset();
286
+
272
287
  /**
273
288
  * Converts this sketch to a compact sketch (ordered or unordered).
274
289
  * @param ordered optional flag to specify if ordered sketch should be produced
@@ -285,11 +300,10 @@ private:
285
300
  theta_table table_;
286
301
 
287
302
  // for builder
288
- update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta,
289
- uint64_t seed, const Allocator& allocator);
303
+ update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p,
304
+ uint64_t theta, uint64_t seed, const Allocator& allocator);
290
305
 
291
- using ostrstream = typename Base::ostrstream;
292
- virtual void print_specifics(ostrstream& os) const;
306
+ virtual void print_specifics(std::ostringstream& os) const;
293
307
  };
294
308
 
295
309
  // compact sketch
@@ -311,7 +325,8 @@ public:
311
325
  // - as a result of a set operation
312
326
  // - by deserializing a previously serialized compact sketch
313
327
 
314
- compact_theta_sketch_alloc(const Base& other, bool ordered);
328
+ template<typename Other>
329
+ compact_theta_sketch_alloc(const Other& other, bool ordered);
315
330
  compact_theta_sketch_alloc(const compact_theta_sketch_alloc&) = default;
316
331
  compact_theta_sketch_alloc(compact_theta_sketch_alloc&&) noexcept = default;
317
332
  virtual ~compact_theta_sketch_alloc() = default;
@@ -376,8 +391,7 @@ private:
376
391
  uint64_t theta_;
377
392
  std::vector<uint64_t, Allocator> entries_;
378
393
 
379
- using ostrstream = typename Base::ostrstream;
380
- virtual void print_specifics(ostrstream& os) const;
394
+ virtual void print_specifics(std::ostringstream& os) const;
381
395
  };
382
396
 
383
397
  template<typename Allocator>
@@ -387,10 +401,54 @@ public:
387
401
  update_theta_sketch_alloc build() const;
388
402
  };
389
403
 
404
+ // This is to wrap a buffer containing a serialized compact sketch and use it in a set operation avoiding some cost of deserialization.
405
+ // It does not take the ownership of the buffer.
406
+
407
+ template<typename Allocator = std::allocator<uint64_t>>
408
+ class wrapped_compact_theta_sketch_alloc : public base_theta_sketch_alloc<Allocator> {
409
+ public:
410
+ using const_iterator = const uint64_t*;
411
+
412
+ Allocator get_allocator() const;
413
+ bool is_empty() const;
414
+ bool is_ordered() const;
415
+ uint64_t get_theta64() const;
416
+ uint32_t get_num_retained() const;
417
+ uint16_t get_seed_hash() const;
418
+
419
+ const_iterator begin() const;
420
+ const_iterator end() const;
421
+
422
+ /**
423
+ * This method wraps a serialized compact sketch as an array of bytes.
424
+ * @param bytes pointer to the array of bytes
425
+ * @param size the size of the array
426
+ * @param seed the seed for the hash function that was used to create the sketch
427
+ * @return an instance of the sketch
428
+ */
429
+ static const wrapped_compact_theta_sketch_alloc wrap(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, bool dump_on_error = false);
430
+
431
+ protected:
432
+ virtual void print_specifics(std::ostringstream& os) const;
433
+ virtual void print_items(std::ostringstream& os) const;
434
+
435
+ private:
436
+ bool is_empty_;
437
+ bool is_ordered_;
438
+ uint16_t seed_hash_;
439
+ uint32_t num_entries_;
440
+ uint64_t theta_;
441
+ const uint64_t* entries_;
442
+
443
+ wrapped_compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint32_t num_entries,
444
+ uint64_t theta, const uint64_t* entries);
445
+ };
446
+
390
447
  // aliases with default allocator for convenience
391
448
  using theta_sketch = theta_sketch_alloc<std::allocator<uint64_t>>;
392
449
  using update_theta_sketch = update_theta_sketch_alloc<std::allocator<uint64_t>>;
393
450
  using compact_theta_sketch = compact_theta_sketch_alloc<std::allocator<uint64_t>>;
451
+ using wrapped_compact_theta_sketch = wrapped_compact_theta_sketch_alloc<std::allocator<uint64_t>>;
394
452
 
395
453
  } /* namespace datasketches */
396
454