datasketches 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +12 -13
  4. data/ext/datasketches/ext.cpp +1 -1
  5. data/ext/datasketches/ext.h +4 -0
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/fi_wrapper.cpp +6 -8
  8. data/ext/datasketches/hll_wrapper.cpp +13 -14
  9. data/ext/datasketches/kll_wrapper.cpp +28 -76
  10. data/ext/datasketches/theta_wrapper.cpp +27 -41
  11. data/ext/datasketches/vo_wrapper.cpp +4 -6
  12. data/lib/datasketches/version.rb +1 -1
  13. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  14. data/vendor/datasketches-cpp/README.md +4 -4
  15. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
  16. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
  17. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
  18. data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
  19. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
  20. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
  21. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
  22. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
  23. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
  24. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
  25. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
  26. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
  27. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
  28. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
  29. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
  30. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
  31. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
  32. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
  33. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
  34. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
  35. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
  36. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
  37. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
  38. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  39. data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
  40. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
  41. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
  43. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
  44. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
  45. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
  46. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
  47. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
  48. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
  49. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
  50. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
  51. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
  52. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
  53. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
  54. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
  55. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
  56. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
  57. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
  58. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
  59. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
  60. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
  62. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
  63. data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
  64. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
  65. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
  66. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
  67. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
  68. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
  70. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
  71. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
  72. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
  74. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
  75. data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
  76. data/vendor/datasketches-cpp/python/README.md +6 -3
  77. data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
  78. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
  79. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
  80. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
  81. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
  82. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
  83. data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
  84. data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
  85. data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
  86. data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
  87. data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
  88. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
  89. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
  90. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
  91. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
  92. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
  93. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
  94. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
  95. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  96. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  97. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  98. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  99. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  100. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
  101. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
  102. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
  103. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
  104. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
  105. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
  106. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
  107. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
  108. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
  109. data/vendor/datasketches-cpp/setup.py +5 -3
  110. data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
  111. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
  112. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
  114. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
  115. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
  116. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
  117. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
  119. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
  120. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
  121. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
  122. data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
  123. data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
  124. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
  125. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
  126. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
  127. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
  128. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
  129. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
  130. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
  131. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
  132. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
  133. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
  134. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  135. data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
  136. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
  137. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
  138. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
  139. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
  140. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
  141. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
  142. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
  143. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
  144. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
  145. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
  146. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
  147. metadata +43 -34
  148. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
  149. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
  150. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
  151. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
  152. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
  153. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
  154. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
  155. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
  156. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  157. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
  160. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -20,6 +20,8 @@
20
20
  #ifndef THETA_CONSTANTS_HPP_
21
21
  #define THETA_CONSTANTS_HPP_
22
22
 
23
+ #include <climits>
24
+
23
25
  namespace datasketches {
24
26
 
25
27
  namespace theta_constants {
@@ -20,29 +20,28 @@
20
20
  #ifndef THETA_INTERSECTION_HPP_
21
21
  #define THETA_INTERSECTION_HPP_
22
22
 
23
- #include <memory>
24
- #include <functional>
25
- #include <climits>
26
-
27
23
  #include "theta_sketch.hpp"
28
- #include "common_defs.hpp"
24
+ #include "theta_intersection_base.hpp"
29
25
 
30
26
  namespace datasketches {
31
27
 
32
- /*
33
- * author Alexander Saydakov
34
- * author Lee Rhodes
35
- * author Kevin Lang
36
- */
37
-
38
- template<typename A>
28
+ template<typename Allocator = std::allocator<uint64_t>>
39
29
  class theta_intersection_alloc {
40
30
  public:
41
- /**
42
- * Creates an instance of the intersection with a given hash seed.
43
- * @param seed hash seed
44
- */
45
- explicit theta_intersection_alloc(uint64_t seed = DEFAULT_SEED);
31
+ using Entry = uint64_t;
32
+ using ExtractKey = trivial_extract_key;
33
+ using Sketch = theta_sketch_alloc<Allocator>;
34
+ using CompactSketch = compact_theta_sketch_alloc<Allocator>;
35
+
36
+ struct pass_through_policy {
37
+ uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
38
+ unused(incoming_entry);
39
+ return internal_entry;
40
+ }
41
+ };
42
+ using State = theta_intersection_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
43
+
44
+ explicit theta_intersection_alloc(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
46
45
 
47
46
  /**
48
47
  * Updates the intersection with a given sketch.
@@ -50,7 +49,8 @@ public:
50
49
  * can reduce the current set to leave the overlapping subset only.
51
50
  * @param sketch represents input set for the intersection
52
51
  */
53
- void update(const theta_sketch_alloc<A>& sketch);
52
+ template<typename FwdSketch>
53
+ void update(FwdSketch&& sketch);
54
54
 
55
55
  /**
56
56
  * Produces a copy of the current state of the intersection.
@@ -59,7 +59,7 @@ public:
59
59
  * @param ordered optional flag to specify if ordered sketch should be produced
60
60
  * @return the result of the intersection
61
61
  */
62
- compact_theta_sketch_alloc<A> get_result(bool ordered = true) const;
62
+ CompactSketch get_result(bool ordered = true) const;
63
63
 
64
64
  /**
65
65
  * Returns true if the state of the intersection is defined (not infinite "universe").
@@ -68,21 +68,14 @@ public:
68
68
  bool has_result() const;
69
69
 
70
70
  private:
71
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
72
- bool is_valid_;
73
- bool is_empty_;
74
- uint64_t theta_;
75
- uint8_t lg_size_;
76
- vector_u64<A> keys_;
77
- uint32_t num_keys_;
78
- uint16_t seed_hash_;
71
+ State state_;
79
72
  };
80
73
 
81
74
  // alias with default allocator for convenience
82
- typedef theta_intersection_alloc<std::allocator<void>> theta_intersection;
75
+ using theta_intersection = theta_intersection_alloc<std::allocator<uint64_t>>;
83
76
 
84
77
  } /* namespace datasketches */
85
78
 
86
79
  #include "theta_intersection_impl.hpp"
87
80
 
88
- # endif
81
+ #endif
@@ -20,109 +20,27 @@
20
20
  #ifndef THETA_INTERSECTION_IMPL_HPP_
21
21
  #define THETA_INTERSECTION_IMPL_HPP_
22
22
 
23
- #include <algorithm>
24
-
25
23
  namespace datasketches {
26
24
 
27
- /*
28
- * author Alexander Saydakov
29
- * author Lee Rhodes
30
- * author Kevin Lang
31
- */
32
-
33
25
  template<typename A>
34
- theta_intersection_alloc<A>::theta_intersection_alloc(uint64_t seed):
35
- is_valid_(false),
36
- is_empty_(false),
37
- theta_(theta_sketch_alloc<A>::MAX_THETA),
38
- lg_size_(0),
39
- keys_(),
40
- num_keys_(0),
41
- seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
26
+ theta_intersection_alloc<A>::theta_intersection_alloc(uint64_t seed, const A& allocator):
27
+ state_(seed, pass_through_policy(), allocator)
42
28
  {}
43
29
 
44
30
  template<typename A>
45
- void theta_intersection_alloc<A>::update(const theta_sketch_alloc<A>& sketch) {
46
- if (is_empty_) return;
47
- if (!sketch.is_empty() && sketch.get_seed_hash() != seed_hash_) throw std::invalid_argument("seed hash mismatch");
48
- is_empty_ |= sketch.is_empty();
49
- theta_ = std::min(theta_, sketch.get_theta64());
50
- if (is_valid_ && num_keys_ == 0) return;
51
- if (sketch.get_num_retained() == 0) {
52
- is_valid_ = true;
53
- if (keys_.size() > 0) {
54
- keys_.resize(0);
55
- lg_size_ = 0;
56
- num_keys_ = 0;
57
- }
58
- return;
59
- }
60
- if (!is_valid_) { // first update, clone incoming sketch
61
- is_valid_ = true;
62
- lg_size_ = lg_size_from_count(sketch.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
63
- keys_.resize(1 << lg_size_, 0);
64
- for (auto key: sketch) {
65
- if (!update_theta_sketch_alloc<A>::hash_search_or_insert(key, keys_.data(), lg_size_)) {
66
- throw std::invalid_argument("duplicate key, possibly corrupted input sketch");
67
- }
68
- ++num_keys_;
69
- }
70
- if (num_keys_ != sketch.get_num_retained()) throw std::invalid_argument("num keys mismatch, possibly corrupted input sketch");
71
- } else { // intersection
72
- const uint32_t max_matches = std::min(num_keys_, sketch.get_num_retained());
73
- vector_u64<A> matched_keys(max_matches);
74
- uint32_t match_count = 0;
75
- uint32_t count = 0;
76
- for (auto key: sketch) {
77
- if (key < theta_) {
78
- if (update_theta_sketch_alloc<A>::hash_search(key, keys_.data(), lg_size_)) {
79
- if (match_count == max_matches) throw std::invalid_argument("max matches exceeded, possibly corrupted input sketch");
80
- matched_keys[match_count++] = key;
81
- }
82
- } else if (sketch.is_ordered()) {
83
- break; // early stop
84
- }
85
- ++count;
86
- }
87
- if (count > sketch.get_num_retained()) {
88
- throw std::invalid_argument(" more keys then expected, possibly corrupted input sketch");
89
- } else if (!sketch.is_ordered() && count < sketch.get_num_retained()) {
90
- throw std::invalid_argument(" fewer keys then expected, possibly corrupted input sketch");
91
- }
92
- if (match_count == 0) {
93
- keys_.resize(0);
94
- lg_size_ = 0;
95
- num_keys_ = 0;
96
- if (theta_ == theta_sketch_alloc<A>::MAX_THETA) is_empty_ = true;
97
- } else {
98
- const uint8_t lg_size = lg_size_from_count(match_count, update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
99
- if (lg_size != lg_size_) {
100
- lg_size_ = lg_size;
101
- keys_.resize(1 << lg_size_);
102
- }
103
- std::fill(keys_.begin(), keys_.end(), 0);
104
- for (uint32_t i = 0; i < match_count; i++) {
105
- update_theta_sketch_alloc<A>::hash_search_or_insert(matched_keys[i], keys_.data(), lg_size_);
106
- }
107
- num_keys_ = match_count;
108
- }
109
- }
31
+ template<typename SS>
32
+ void theta_intersection_alloc<A>::update(SS&& sketch) {
33
+ state_.update(std::forward<SS>(sketch));
110
34
  }
111
35
 
112
36
  template<typename A>
113
- compact_theta_sketch_alloc<A> theta_intersection_alloc<A>::get_result(bool ordered) const {
114
- if (!is_valid_) throw std::invalid_argument("calling get_result() before calling update() is undefined");
115
- vector_u64<A> keys(num_keys_);
116
- if (num_keys_ > 0) {
117
- std::copy_if(keys_.begin(), keys_.end(), keys.begin(), [](uint64_t key) { return key != 0; });
118
- if (ordered) std::sort(keys.begin(), keys.end());
119
- }
120
- return compact_theta_sketch_alloc<A>(is_empty_, theta_, std::move(keys), seed_hash_, ordered);
37
+ auto theta_intersection_alloc<A>::get_result(bool ordered) const -> CompactSketch {
38
+ return state_.get_result(ordered);
121
39
  }
122
40
 
123
41
  template<typename A>
124
42
  bool theta_intersection_alloc<A>::has_result() const {
125
- return is_valid_;
43
+ return state_.has_result();
126
44
  }
127
45
 
128
46
  } /* namespace datasketches */
@@ -17,28 +17,21 @@
17
17
  * under the License.
18
18
  */
19
19
 
20
- #include <iostream>
20
+ #ifndef THETA_JACCARD_SIMILARITY_HPP_
21
+ #define THETA_JACCARD_SIMILARITY_HPP_
21
22
 
22
- #include <catch.hpp>
23
- #include <tuple_union.hpp>
24
-
25
- #include <theta_union_experimental.hpp>
23
+ #include "theta_jaccard_similarity_base.hpp"
24
+ #include "theta_union.hpp"
25
+ #include "theta_intersection.hpp"
26
26
 
27
27
  namespace datasketches {
28
28
 
29
- TEST_CASE("theta_union_exeperimental") {
30
- auto update_sketch1 = update_theta_sketch_experimental<>::builder().build();
31
- update_sketch1.update(1);
32
- update_sketch1.update(2);
33
-
34
- auto update_sketch2 = update_theta_sketch_experimental<>::builder().build();
35
- update_sketch2.update(1);
36
- update_sketch2.update(3);
29
+ template<typename Allocator = std::allocator<uint64_t>>
30
+ using theta_jaccard_similarity_alloc = jaccard_similarity_base<theta_union_alloc<Allocator>, theta_intersection_alloc<Allocator>, trivial_extract_key>;
37
31
 
38
- auto u = theta_union_experimental<>::builder().build();
39
- u.update(update_sketch1);
40
- u.update(update_sketch2);
41
- auto r = u.get_result();
42
- }
32
+ // alias with default allocator for convenience
33
+ using theta_jaccard_similarity = theta_jaccard_similarity_alloc<std::allocator<uint64_t>>;
43
34
 
44
35
  } /* namespace datasketches */
36
+
37
+ # endif
@@ -17,19 +17,16 @@
17
17
  * under the License.
18
18
  */
19
19
 
20
- #ifndef JACCARD_SIMILARITY_BASE_HPP_
21
- #define JACCARD_SIMILARITY_BASE_HPP_
20
+ #ifndef THETA_JACCARD_SIMILARITY_BASE_HPP_
21
+ #define THETA_JACCARD_SIMILARITY_BASE_HPP_
22
22
 
23
23
  #include <memory>
24
24
  #include <array>
25
25
 
26
- #include <theta_union_experimental.hpp>
27
- #include <theta_intersection_experimental.hpp>
28
- #include <tuple_union.hpp>
29
- #include <tuple_intersection.hpp>
30
- #include <bounds_on_ratios_in_theta_sketched_sets.hpp>
31
- #include <ceiling_power_of_2.hpp>
32
- #include <common_defs.hpp>
26
+ #include "theta_constants.hpp"
27
+ #include "bounds_on_ratios_in_theta_sketched_sets.hpp"
28
+ #include "ceiling_power_of_2.hpp"
29
+ #include "common_defs.hpp"
33
30
 
34
31
  namespace datasketches {
35
32
 
@@ -154,19 +151,6 @@ private:
154
151
 
155
152
  };
156
153
 
157
- template<typename Allocator>
158
- using theta_jaccard_similarity_alloc = jaccard_similarity_base<theta_union_experimental<Allocator>, theta_intersection_experimental<Allocator>, trivial_extract_key>;
159
-
160
- // alias with default allocator for convenience
161
- using theta_jaccard_similarity = theta_jaccard_similarity_alloc<std::allocator<uint64_t>>;
162
-
163
- template<
164
- typename Summary,
165
- typename IntersectionPolicy,
166
- typename UnionPolicy = default_union_policy<Summary>,
167
- typename Allocator = std::allocator<Summary>>
168
- using tuple_jaccard_similarity = jaccard_similarity_base<tuple_union<Summary, UnionPolicy, Allocator>, tuple_intersection<Summary, IntersectionPolicy, Allocator>, pair_extract_key<uint64_t, Summary>>;
169
-
170
154
  } /* namespace datasketches */
171
155
 
172
156
  # endif
@@ -17,6 +17,9 @@
17
17
  * under the License.
18
18
  */
19
19
 
20
+ #ifndef THETA_A_SET_DIFFERENCE_BASE_IMPL_HPP_
21
+ #define THETA_A_SET_DIFFERENCE_BASE_IMPL_HPP_
22
+
20
23
  #include <algorithm>
21
24
 
22
25
  #include "conditional_back_inserter.hpp"
@@ -78,3 +81,5 @@ CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch
78
81
  }
79
82
 
80
83
  } /* namespace datasketches */
84
+
85
+ #endif
@@ -20,45 +20,29 @@
20
20
  #ifndef THETA_SKETCH_HPP_
21
21
  #define THETA_SKETCH_HPP_
22
22
 
23
- #include <memory>
24
- #include <functional>
25
- #include <climits>
26
- #include <vector>
27
-
28
- #include "common_defs.hpp"
23
+ #include "theta_update_sketch_base.hpp"
29
24
 
30
25
  namespace datasketches {
31
26
 
32
- /*
33
- * author Alexander Saydakov
34
- * author Lee Rhodes
35
- * author Kevin Lang
36
- */
37
-
38
- // forward-declarations
39
- template<typename A> class theta_sketch_alloc;
40
- template<typename A> class update_theta_sketch_alloc;
41
- template<typename A> class compact_theta_sketch_alloc;
42
- template<typename A> class theta_union_alloc;
43
- template<typename A> class theta_intersection_alloc;
44
- template<typename A> class theta_a_not_b_alloc;
45
-
46
- // for serialization as raw bytes
47
- template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
48
- template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
49
-
50
- template<typename A>
27
+ template<typename Allocator = std::allocator<uint64_t>>
51
28
  class theta_sketch_alloc {
52
29
  public:
53
- static const uint64_t MAX_THETA = LLONG_MAX; // signed max for compatibility with Java
54
- static const uint8_t SERIAL_VERSION = 3;
30
+ using Entry = uint64_t;
31
+ using ExtractKey = trivial_extract_key;
32
+ using iterator = theta_iterator<Entry, ExtractKey>;
33
+ using const_iterator = theta_const_iterator<Entry, ExtractKey>;
55
34
 
56
35
  virtual ~theta_sketch_alloc() = default;
57
36
 
37
+ /**
38
+ * @return allocator
39
+ */
40
+ virtual Allocator get_allocator() const = 0;
41
+
58
42
  /**
59
43
  * @return true if this sketch represents an empty set (not the same as no retained entries!)
60
44
  */
61
- bool is_empty() const;
45
+ virtual bool is_empty() const = 0;
62
46
 
63
47
  /**
64
48
  * @return estimate of the distinct count of the input stream
@@ -96,13 +80,16 @@ public:
96
80
  /**
97
81
  * @return theta as a positive integer between 0 and LLONG_MAX
98
82
  */
99
- uint64_t get_theta64() const;
83
+ virtual uint64_t get_theta64() const = 0;
100
84
 
101
85
  /**
102
86
  * @return the number of retained entries in the sketch
103
87
  */
104
88
  virtual uint32_t get_num_retained() const = 0;
105
89
 
90
+ /**
91
+ * @return hash of the seed that was used to hash the input
92
+ */
106
93
  virtual uint16_t get_seed_hash() const = 0;
107
94
 
108
95
  /**
@@ -111,109 +98,82 @@ public:
111
98
  virtual bool is_ordered() const = 0;
112
99
 
113
100
  /**
114
- * Writes a human-readable summary of this sketch to a given stream
101
+ * Provides a human-readable summary of this sketch as a string
115
102
  * @param print_items if true include the list of items retained by the sketch
103
+ * @return sketch summary as a string
116
104
  */
117
- virtual string<A> to_string(bool print_items = false) const = 0;
118
-
119
- /**
120
- * This method serializes the sketch into a given stream in a binary form
121
- * @param os output stream
122
- */
123
- virtual void serialize(std::ostream& os) const = 0;
124
-
125
- // This is a convenience alias for users
126
- // The type returned by the following serialize method
127
- typedef vector_u8<A> vector_bytes;
105
+ virtual string<Allocator> to_string(bool print_items = false) const;
128
106
 
129
107
  /**
130
- * This method serializes the sketch as a vector of bytes.
131
- * An optional header can be reserved in front of the sketch.
132
- * It is an uninitialized space of a given size.
133
- * This header is used in Datasketches PostgreSQL extension.
134
- * @param header_size_bytes space to reserve in front of the sketch
135
- */
136
- virtual vector_bytes serialize(unsigned header_size_bytes = 0) const = 0;
137
-
138
- // This is a convenience alias for users
139
- // The type returned by the following deserialize methods
140
- // It is not possible to return instances of an abstract type, so this has to be a pointer
141
- typedef std::unique_ptr<theta_sketch_alloc<A>, std::function<void(theta_sketch_alloc<A>*)>> unique_ptr;
142
-
143
- /**
144
- * This method deserializes a sketch from a given stream.
145
- * @param is input stream
146
- * @param seed the seed for the hash function that was used to create the sketch
147
- * @return an instance of a sketch as a unique_ptr
108
+ * Iterator over hash values in this sketch.
109
+ * @return begin iterator
148
110
  */
149
- static unique_ptr deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
111
+ virtual iterator begin() = 0;
150
112
 
151
113
  /**
152
- * This method deserializes a sketch from a given array of bytes.
153
- * @param bytes pointer to the array of bytes
154
- * @param size the size of the array
155
- * @param seed the seed for the hash function that was used to create the sketch
156
- * @return an instance of the sketch
114
+ * Iterator pointing past the valid range.
115
+ * Not to be incremented or dereferenced.
116
+ * @return end iterator
157
117
  */
158
- static unique_ptr deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
159
-
160
- class const_iterator;
118
+ virtual iterator end() = 0;
161
119
 
162
120
  /**
163
- * Iterator over hash values in this sketch.
121
+ * Const iterator over hash values in this sketch.
164
122
  * @return begin iterator
165
123
  */
166
124
  virtual const_iterator begin() const = 0;
167
125
 
168
126
  /**
169
- * Iterator pointing past the valid range.
127
+ * Const iterator pointing past the valid range.
170
128
  * Not to be incremented or dereferenced.
171
129
  * @return end iterator
172
130
  */
173
131
  virtual const_iterator end() const = 0;
174
132
 
175
133
  protected:
176
- enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
177
-
178
- bool is_empty_;
179
- uint64_t theta_;
180
-
181
- theta_sketch_alloc(bool is_empty, uint64_t theta);
182
-
183
- static uint16_t get_seed_hash(uint64_t seed);
184
-
185
- static void check_sketch_type(uint8_t actual, uint8_t expected);
186
- static void check_serial_version(uint8_t actual, uint8_t expected);
187
- static void check_seed_hash(uint16_t actual, uint16_t expected);
188
-
189
- friend theta_intersection_alloc<A>;
190
- friend theta_a_not_b_alloc<A>;
134
+ using ostrstream = std::basic_ostringstream<char, std::char_traits<char>, AllocChar<Allocator>>;
135
+ virtual void print_specifics(ostrstream& os) const = 0;
191
136
  };
192
137
 
193
- // update sketch
194
-
195
- template<typename A> using AllocU64 = typename std::allocator_traits<A>::template rebind_alloc<uint64_t>;
196
- template<typename A> using vector_u64 = std::vector<uint64_t, AllocU64<A>>;
138
+ // forward declaration
139
+ template<typename A> class compact_theta_sketch_alloc;
197
140
 
198
- template<typename A>
199
- class update_theta_sketch_alloc: public theta_sketch_alloc<A> {
141
+ template<typename Allocator = std::allocator<uint64_t>>
142
+ class update_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
200
143
  public:
201
- class builder;
202
- enum resize_factor { X1, X2, X4, X8 };
203
- static const uint8_t SKETCH_TYPE = 2;
144
+ using Base = theta_sketch_alloc<Allocator>;
145
+ using Entry = typename Base::Entry;
146
+ using ExtractKey = typename Base::ExtractKey;
147
+ using iterator = typename Base::iterator;
148
+ using const_iterator = typename Base::const_iterator;
149
+ using theta_table = theta_update_sketch_base<Entry, ExtractKey, Allocator>;
150
+ using resize_factor = typename theta_table::resize_factor;
204
151
 
205
152
  // No constructor here. Use builder instead.
153
+ class builder;
206
154
 
155
+ update_theta_sketch_alloc(const update_theta_sketch_alloc&) = default;
156
+ update_theta_sketch_alloc(update_theta_sketch_alloc&&) noexcept = default;
207
157
  virtual ~update_theta_sketch_alloc() = default;
158
+ update_theta_sketch_alloc& operator=(const update_theta_sketch_alloc&) = default;
159
+ update_theta_sketch_alloc& operator=(update_theta_sketch_alloc&&) = default;
208
160
 
209
- virtual uint32_t get_num_retained() const;
210
- virtual uint16_t get_seed_hash() const;
161
+ virtual Allocator get_allocator() const;
162
+ virtual bool is_empty() const;
211
163
  virtual bool is_ordered() const;
212
- virtual string<A> to_string(bool print_items = false) const;
213
- virtual void serialize(std::ostream& os) const;
214
- typedef vector_u8<A> vector_bytes; // alias for users
215
- // header space is reserved, but not initialized
216
- virtual vector_bytes serialize(unsigned header_size_bytes = 0) const;
164
+ virtual uint16_t get_seed_hash() const;
165
+ virtual uint64_t get_theta64() const;
166
+ virtual uint32_t get_num_retained() const;
167
+
168
+ /**
169
+ * @return configured nominal number of entries in the sketch
170
+ */
171
+ uint8_t get_lg_k() const;
172
+
173
+ /**
174
+ * @return configured resize factor of the sketch
175
+ */
176
+ resize_factor get_rf() const;
217
177
 
218
178
  /**
219
179
  * Update this sketch with a given string.
@@ -302,7 +262,7 @@ public:
302
262
  * @param data pointer to the data
303
263
  * @param length of the data in bytes
304
264
  */
305
- void update(const void* data, unsigned length);
265
+ void update(const void* data, size_t length);
306
266
 
307
267
  /**
308
268
  * Remove retained entries in excess of the nominal size k (if any)
@@ -314,105 +274,85 @@ public:
314
274
  * @param ordered optional flag to specify if ordered sketch should be produced
315
275
  * @return compact sketch
316
276
  */
317
- compact_theta_sketch_alloc<A> compact(bool ordered = true) const;
318
-
319
- virtual typename theta_sketch_alloc<A>::const_iterator begin() const;
320
- virtual typename theta_sketch_alloc<A>::const_iterator end() const;
321
-
322
- /**
323
- * This method deserializes a sketch from a given stream.
324
- * @param is input stream
325
- * @param seed the seed for the hash function that was used to create the sketch
326
- * @return an instance of a sketch
327
- */
328
- static update_theta_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
277
+ compact_theta_sketch_alloc<Allocator> compact(bool ordered = true) const;
329
278
 
330
- /**
331
- * This method deserializes a sketch from a given array of bytes.
332
- * @param bytes pointer to the array of bytes
333
- * @param size the size of the array
334
- * @param seed the seed for the hash function that was used to create the sketch
335
- * @return an instance of the sketch
336
- */
337
- static update_theta_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
279
+ virtual iterator begin();
280
+ virtual iterator end();
281
+ virtual const_iterator begin() const;
282
+ virtual const_iterator end() const;
338
283
 
339
284
  private:
340
- // resize threshold = 0.5 tuned for speed
341
- static constexpr double RESIZE_THRESHOLD = 0.5;
342
- // hash table rebuild threshold = 15/16
343
- static constexpr double REBUILD_THRESHOLD = 15.0 / 16.0;
344
-
345
- static constexpr uint8_t STRIDE_HASH_BITS = 7;
346
- static constexpr uint32_t STRIDE_MASK = (1 << STRIDE_HASH_BITS) - 1;
347
-
348
- uint8_t lg_cur_size_;
349
- uint8_t lg_nom_size_;
350
- vector_u64<A> keys_;
351
- uint32_t num_keys_;
352
- resize_factor rf_;
353
- float p_;
354
- uint64_t seed_;
355
- uint32_t capacity_;
285
+ theta_table table_;
356
286
 
357
287
  // for builder
358
- update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed);
359
-
360
- // for deserialize
361
- update_theta_sketch_alloc(bool is_empty, uint64_t theta, uint8_t lg_cur_size, uint8_t lg_nom_size, vector_u64<A>&& keys, uint32_t num_keys, resize_factor rf, float p, uint64_t seed);
288
+ update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta,
289
+ uint64_t seed, const Allocator& allocator);
362
290
 
363
- void resize();
364
- void rebuild();
365
-
366
- friend theta_union_alloc<A>;
367
- void internal_update(uint64_t hash);
368
-
369
- friend theta_intersection_alloc<A>;
370
- friend theta_a_not_b_alloc<A>;
371
- static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
372
- static inline uint32_t get_stride(uint64_t hash, uint8_t lg_size);
373
- static bool hash_search_or_insert(uint64_t hash, uint64_t* table, uint8_t lg_size);
374
- static bool hash_search(uint64_t hash, const uint64_t* table, uint8_t lg_size);
375
-
376
- friend theta_sketch_alloc<A>;
377
- static update_theta_sketch_alloc<A> internal_deserialize(std::istream& is, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed);
378
- static update_theta_sketch_alloc<A> internal_deserialize(const void* bytes, size_t size, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed);
291
+ using ostrstream = typename Base::ostrstream;
292
+ virtual void print_specifics(ostrstream& os) const;
379
293
  };
380
294
 
381
295
  // compact sketch
382
296
 
383
- template<typename A>
384
- class compact_theta_sketch_alloc: public theta_sketch_alloc<A> {
297
+ template<typename Allocator = std::allocator<uint64_t>>
298
+ class compact_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
385
299
  public:
300
+ using Base = theta_sketch_alloc<Allocator>;
301
+ using iterator = typename Base::iterator;
302
+ using const_iterator = typename Base::const_iterator;
303
+ using AllocBytes = typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>;
304
+ using vector_bytes = std::vector<uint8_t, AllocBytes>;
305
+
306
+ static const uint8_t SERIAL_VERSION = 3;
386
307
  static const uint8_t SKETCH_TYPE = 3;
387
308
 
388
- // No constructor here.
389
309
  // Instances of this type can be obtained:
390
- // - by compacting an update_theta_sketch
310
+ // - by compacting an update_theta_sketch_alloc
391
311
  // - as a result of a set operation
392
312
  // - by deserializing a previously serialized compact sketch
393
313
 
394
- compact_theta_sketch_alloc(const theta_sketch_alloc<A>& other, bool ordered);
314
+ compact_theta_sketch_alloc(const Base& other, bool ordered);
315
+ compact_theta_sketch_alloc(const compact_theta_sketch_alloc&) = default;
316
+ compact_theta_sketch_alloc(compact_theta_sketch_alloc&&) noexcept = default;
395
317
  virtual ~compact_theta_sketch_alloc() = default;
318
+ compact_theta_sketch_alloc& operator=(const compact_theta_sketch_alloc&) = default;
319
+ compact_theta_sketch_alloc& operator=(compact_theta_sketch_alloc&&) = default;
396
320
 
321
+ virtual Allocator get_allocator() const;
322
+ virtual bool is_empty() const;
323
+ virtual bool is_ordered() const;
324
+ virtual uint64_t get_theta64() const;
397
325
  virtual uint32_t get_num_retained() const;
398
326
  virtual uint16_t get_seed_hash() const;
399
- virtual bool is_ordered() const;
400
- virtual string<A> to_string(bool print_items = false) const;
401
- virtual void serialize(std::ostream& os) const;
402
- typedef vector_u8<A> vector_bytes; // alias for users
403
- // header space is reserved, but not initialized
404
- virtual vector_bytes serialize(unsigned header_size_bytes = 0) const;
405
327
 
406
- virtual typename theta_sketch_alloc<A>::const_iterator begin() const;
407
- virtual typename theta_sketch_alloc<A>::const_iterator end() const;
328
+ /**
329
+ * This method serializes the sketch into a given stream in a binary form
330
+ * @param os output stream
331
+ */
332
+ void serialize(std::ostream& os) const;
333
+
334
+ /**
335
+ * This method serializes the sketch as a vector of bytes.
336
+ * An optional header can be reserved in front of the sketch.
337
+ * It is an uninitialized space of a given size.
338
+ * This header is used by the DataSketches PostgreSQL extension.
339
+ * @param header_size_bytes space to reserve in front of the sketch
340
+ */
341
+ vector_bytes serialize(unsigned header_size_bytes = 0) const;
342
+
343
+ virtual iterator begin();
344
+ virtual iterator end();
345
+ virtual const_iterator begin() const;
346
+ virtual const_iterator end() const;
408
347
 
409
348
  /**
410
349
  * This method deserializes a sketch from a given stream.
411
350
  * @param is input stream
412
351
  * @param seed the seed for the hash function that was used to create the sketch
413
- * @return an instance of a sketch
352
+ * @return an instance of the sketch
414
353
  */
415
- static compact_theta_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
354
+ static compact_theta_sketch_alloc deserialize(std::istream& is,
355
+ uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
416
356
 
417
357
  /**
418
358
  * This method deserializes a sketch from a given array of bytes.
@@ -421,110 +361,36 @@ public:
421
361
  * @param seed the seed for the hash function that was used to create the sketch
422
362
  * @return an instance of the sketch
423
363
  */
424
- static compact_theta_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
364
+ static compact_theta_sketch_alloc deserialize(const void* bytes, size_t size,
365
+ uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
366
+
367
+ // for internal use
368
+ compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<uint64_t, Allocator>&& entries);
425
369
 
426
370
  private:
427
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
371
+ enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
428
372
 
429
- vector_u64<A> keys_;
430
- uint16_t seed_hash_;
373
+ bool is_empty_;
431
374
  bool is_ordered_;
375
+ uint16_t seed_hash_;
376
+ uint64_t theta_;
377
+ std::vector<uint64_t, Allocator> entries_;
432
378
 
433
- friend theta_sketch_alloc<A>;
434
- friend update_theta_sketch_alloc<A>;
435
- friend theta_union_alloc<A>;
436
- friend theta_intersection_alloc<A>;
437
- friend theta_a_not_b_alloc<A>;
438
- compact_theta_sketch_alloc(bool is_empty, uint64_t theta, vector_u64<A>&& keys, uint16_t seed_hash, bool is_ordered);
439
- static compact_theta_sketch_alloc<A> internal_deserialize(std::istream& is, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash);
440
- static compact_theta_sketch_alloc<A> internal_deserialize(const void* bytes, size_t size, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash);
441
- };
442
-
443
- // builder
444
-
445
- template<typename A>
446
- class update_theta_sketch_alloc<A>::builder {
447
- public:
448
- static const uint8_t MIN_LG_K = 5;
449
- static const uint8_t DEFAULT_LG_K = 12;
450
- static const resize_factor DEFAULT_RESIZE_FACTOR = X8;
451
-
452
- /**
453
- * Creates and instance of the builder with default parameters.
454
- */
455
- builder();
456
-
457
- /**
458
- * Set log2(k), where k is a nominal number of entries in the sketch
459
- * @param lg_k base 2 logarithm of nominal number of entries
460
- * @return this builder
461
- */
462
- builder& set_lg_k(uint8_t lg_k);
463
-
464
- /**
465
- * Set resize factor for the internal hash table (defaults to 8)
466
- * @param rf resize factor
467
- * @return this builder
468
- */
469
- builder& set_resize_factor(resize_factor rf);
470
-
471
- /**
472
- * Set sampling probability (initial theta). The default is 1, so the sketch retains
473
- * all entries until it reaches the limit, at which point it goes into the estimation mode
474
- * and reduces the effective sampling probability (theta) as necessary.
475
- * @param p sampling probability
476
- * @return this builder
477
- */
478
- builder& set_p(float p);
479
-
480
- /**
481
- * Set the seed for the hash function. Should be used carefully if needed.
482
- * Sketches produced with different seed are not compatible
483
- * and cannot be mixed in set operations.
484
- * @param seed hash seed
485
- * @return this builder
486
- */
487
- builder& set_seed(uint64_t seed);
488
-
489
- /**
490
- * This is to create an instance of the sketch with predefined parameters.
491
- * @return and instance of the sketch
492
- */
493
- update_theta_sketch_alloc<A> build() const;
494
-
495
- private:
496
- uint8_t lg_k_;
497
- resize_factor rf_;
498
- float p_;
499
- uint64_t seed_;
500
-
501
- static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
379
+ using ostrstream = typename Base::ostrstream;
380
+ virtual void print_specifics(ostrstream& os) const;
502
381
  };
503
382
 
504
- // iterator
505
- template<typename A>
506
- class theta_sketch_alloc<A>::const_iterator: public std::iterator<std::input_iterator_tag, uint64_t> {
383
+ template<typename Allocator>
384
+ class update_theta_sketch_alloc<Allocator>::builder: public theta_base_builder<builder, Allocator> {
507
385
  public:
508
- const_iterator& operator++();
509
- const_iterator operator++(int);
510
- bool operator==(const const_iterator& other) const;
511
- bool operator!=(const const_iterator& other) const;
512
- uint64_t operator*() const;
513
-
514
- private:
515
- const uint64_t* keys_;
516
- uint32_t size_;
517
- uint32_t index_;
518
- const_iterator(const uint64_t* keys, uint32_t size, uint32_t index);
519
- friend class update_theta_sketch_alloc<A>;
520
- friend class compact_theta_sketch_alloc<A>;
386
+ builder(const Allocator& allocator = Allocator());
387
+ update_theta_sketch_alloc build() const;
521
388
  };
522
389
 
523
-
524
390
  // aliases with default allocator for convenience
525
- typedef theta_sketch_alloc<std::allocator<void>> theta_sketch;
526
- typedef update_theta_sketch_alloc<std::allocator<void>> update_theta_sketch;
527
- typedef compact_theta_sketch_alloc<std::allocator<void>> compact_theta_sketch;
391
+ using theta_sketch = theta_sketch_alloc<std::allocator<uint64_t>>;
392
+ using update_theta_sketch = update_theta_sketch_alloc<std::allocator<uint64_t>>;
393
+ using compact_theta_sketch = compact_theta_sketch_alloc<std::allocator<uint64_t>>;
528
394
 
529
395
  } /* namespace datasketches */
530
396