datasketches 0.1.2 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (160) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +12 -13
  4. data/ext/datasketches/ext.cpp +1 -1
  5. data/ext/datasketches/ext.h +4 -0
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/fi_wrapper.cpp +6 -8
  8. data/ext/datasketches/hll_wrapper.cpp +13 -14
  9. data/ext/datasketches/kll_wrapper.cpp +28 -76
  10. data/ext/datasketches/theta_wrapper.cpp +27 -41
  11. data/ext/datasketches/vo_wrapper.cpp +4 -6
  12. data/lib/datasketches/version.rb +1 -1
  13. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  14. data/vendor/datasketches-cpp/README.md +4 -4
  15. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
  16. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
  17. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
  18. data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
  19. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
  20. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
  21. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
  22. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
  23. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
  24. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
  25. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
  26. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
  27. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
  28. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
  29. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
  30. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
  31. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
  32. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
  33. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
  34. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
  35. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
  36. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
  37. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
  38. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  39. data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
  40. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
  41. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
  43. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
  44. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
  45. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
  46. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
  47. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
  48. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
  49. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
  50. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
  51. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
  52. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
  53. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
  54. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
  55. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
  56. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
  57. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
  58. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
  59. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
  60. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
  62. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
  63. data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
  64. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
  65. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
  66. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
  67. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
  68. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
  70. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
  71. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
  72. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
  74. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
  75. data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
  76. data/vendor/datasketches-cpp/python/README.md +6 -3
  77. data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
  78. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
  79. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
  80. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
  81. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
  82. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
  83. data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
  84. data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
  85. data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
  86. data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
  87. data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
  88. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
  89. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
  90. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
  91. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
  92. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
  93. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
  94. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
  95. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  96. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  97. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  98. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  99. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  100. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
  101. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
  102. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
  103. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
  104. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
  105. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
  106. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
  107. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
  108. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
  109. data/vendor/datasketches-cpp/setup.py +5 -3
  110. data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
  111. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
  112. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
  114. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
  115. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
  116. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
  117. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
  119. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
  120. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
  121. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
  122. data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
  123. data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
  124. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
  125. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
  126. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
  127. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
  128. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
  129. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
  130. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
  131. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
  132. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
  133. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
  134. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  135. data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
  136. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
  137. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
  138. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
  139. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
  140. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
  141. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
  142. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
  143. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
  144. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
  145. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
  146. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
  147. metadata +43 -34
  148. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
  149. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
  150. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
  151. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
  152. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
  153. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
  154. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
  155. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
  156. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  157. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
  160. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -20,6 +20,8 @@
20
20
  #ifndef THETA_CONSTANTS_HPP_
21
21
  #define THETA_CONSTANTS_HPP_
22
22
 
23
+ #include <climits>
24
+
23
25
  namespace datasketches {
24
26
 
25
27
  namespace theta_constants {
@@ -20,29 +20,28 @@
20
20
  #ifndef THETA_INTERSECTION_HPP_
21
21
  #define THETA_INTERSECTION_HPP_
22
22
 
23
- #include <memory>
24
- #include <functional>
25
- #include <climits>
26
-
27
23
  #include "theta_sketch.hpp"
28
- #include "common_defs.hpp"
24
+ #include "theta_intersection_base.hpp"
29
25
 
30
26
  namespace datasketches {
31
27
 
32
- /*
33
- * author Alexander Saydakov
34
- * author Lee Rhodes
35
- * author Kevin Lang
36
- */
37
-
38
- template<typename A>
28
+ template<typename Allocator = std::allocator<uint64_t>>
39
29
  class theta_intersection_alloc {
40
30
  public:
41
- /**
42
- * Creates an instance of the intersection with a given hash seed.
43
- * @param seed hash seed
44
- */
45
- explicit theta_intersection_alloc(uint64_t seed = DEFAULT_SEED);
31
+ using Entry = uint64_t;
32
+ using ExtractKey = trivial_extract_key;
33
+ using Sketch = theta_sketch_alloc<Allocator>;
34
+ using CompactSketch = compact_theta_sketch_alloc<Allocator>;
35
+
36
+ struct pass_through_policy {
37
+ uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
38
+ unused(incoming_entry);
39
+ return internal_entry;
40
+ }
41
+ };
42
+ using State = theta_intersection_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
43
+
44
+ explicit theta_intersection_alloc(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
46
45
 
47
46
  /**
48
47
  * Updates the intersection with a given sketch.
@@ -50,7 +49,8 @@ public:
50
49
  * can reduce the current set to leave the overlapping subset only.
51
50
  * @param sketch represents input set for the intersection
52
51
  */
53
- void update(const theta_sketch_alloc<A>& sketch);
52
+ template<typename FwdSketch>
53
+ void update(FwdSketch&& sketch);
54
54
 
55
55
  /**
56
56
  * Produces a copy of the current state of the intersection.
@@ -59,7 +59,7 @@ public:
59
59
  * @param ordered optional flag to specify if ordered sketch should be produced
60
60
  * @return the result of the intersection
61
61
  */
62
- compact_theta_sketch_alloc<A> get_result(bool ordered = true) const;
62
+ CompactSketch get_result(bool ordered = true) const;
63
63
 
64
64
  /**
65
65
  * Returns true if the state of the intersection is defined (not infinite "universe").
@@ -68,21 +68,14 @@ public:
68
68
  bool has_result() const;
69
69
 
70
70
  private:
71
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
72
- bool is_valid_;
73
- bool is_empty_;
74
- uint64_t theta_;
75
- uint8_t lg_size_;
76
- vector_u64<A> keys_;
77
- uint32_t num_keys_;
78
- uint16_t seed_hash_;
71
+ State state_;
79
72
  };
80
73
 
81
74
  // alias with default allocator for convenience
82
- typedef theta_intersection_alloc<std::allocator<void>> theta_intersection;
75
+ using theta_intersection = theta_intersection_alloc<std::allocator<uint64_t>>;
83
76
 
84
77
  } /* namespace datasketches */
85
78
 
86
79
  #include "theta_intersection_impl.hpp"
87
80
 
88
- # endif
81
+ #endif
@@ -20,109 +20,27 @@
20
20
  #ifndef THETA_INTERSECTION_IMPL_HPP_
21
21
  #define THETA_INTERSECTION_IMPL_HPP_
22
22
 
23
- #include <algorithm>
24
-
25
23
  namespace datasketches {
26
24
 
27
- /*
28
- * author Alexander Saydakov
29
- * author Lee Rhodes
30
- * author Kevin Lang
31
- */
32
-
33
25
  template<typename A>
34
- theta_intersection_alloc<A>::theta_intersection_alloc(uint64_t seed):
35
- is_valid_(false),
36
- is_empty_(false),
37
- theta_(theta_sketch_alloc<A>::MAX_THETA),
38
- lg_size_(0),
39
- keys_(),
40
- num_keys_(0),
41
- seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
26
+ theta_intersection_alloc<A>::theta_intersection_alloc(uint64_t seed, const A& allocator):
27
+ state_(seed, pass_through_policy(), allocator)
42
28
  {}
43
29
 
44
30
  template<typename A>
45
- void theta_intersection_alloc<A>::update(const theta_sketch_alloc<A>& sketch) {
46
- if (is_empty_) return;
47
- if (!sketch.is_empty() && sketch.get_seed_hash() != seed_hash_) throw std::invalid_argument("seed hash mismatch");
48
- is_empty_ |= sketch.is_empty();
49
- theta_ = std::min(theta_, sketch.get_theta64());
50
- if (is_valid_ && num_keys_ == 0) return;
51
- if (sketch.get_num_retained() == 0) {
52
- is_valid_ = true;
53
- if (keys_.size() > 0) {
54
- keys_.resize(0);
55
- lg_size_ = 0;
56
- num_keys_ = 0;
57
- }
58
- return;
59
- }
60
- if (!is_valid_) { // first update, clone incoming sketch
61
- is_valid_ = true;
62
- lg_size_ = lg_size_from_count(sketch.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
63
- keys_.resize(1 << lg_size_, 0);
64
- for (auto key: sketch) {
65
- if (!update_theta_sketch_alloc<A>::hash_search_or_insert(key, keys_.data(), lg_size_)) {
66
- throw std::invalid_argument("duplicate key, possibly corrupted input sketch");
67
- }
68
- ++num_keys_;
69
- }
70
- if (num_keys_ != sketch.get_num_retained()) throw std::invalid_argument("num keys mismatch, possibly corrupted input sketch");
71
- } else { // intersection
72
- const uint32_t max_matches = std::min(num_keys_, sketch.get_num_retained());
73
- vector_u64<A> matched_keys(max_matches);
74
- uint32_t match_count = 0;
75
- uint32_t count = 0;
76
- for (auto key: sketch) {
77
- if (key < theta_) {
78
- if (update_theta_sketch_alloc<A>::hash_search(key, keys_.data(), lg_size_)) {
79
- if (match_count == max_matches) throw std::invalid_argument("max matches exceeded, possibly corrupted input sketch");
80
- matched_keys[match_count++] = key;
81
- }
82
- } else if (sketch.is_ordered()) {
83
- break; // early stop
84
- }
85
- ++count;
86
- }
87
- if (count > sketch.get_num_retained()) {
88
- throw std::invalid_argument(" more keys then expected, possibly corrupted input sketch");
89
- } else if (!sketch.is_ordered() && count < sketch.get_num_retained()) {
90
- throw std::invalid_argument(" fewer keys then expected, possibly corrupted input sketch");
91
- }
92
- if (match_count == 0) {
93
- keys_.resize(0);
94
- lg_size_ = 0;
95
- num_keys_ = 0;
96
- if (theta_ == theta_sketch_alloc<A>::MAX_THETA) is_empty_ = true;
97
- } else {
98
- const uint8_t lg_size = lg_size_from_count(match_count, update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
99
- if (lg_size != lg_size_) {
100
- lg_size_ = lg_size;
101
- keys_.resize(1 << lg_size_);
102
- }
103
- std::fill(keys_.begin(), keys_.end(), 0);
104
- for (uint32_t i = 0; i < match_count; i++) {
105
- update_theta_sketch_alloc<A>::hash_search_or_insert(matched_keys[i], keys_.data(), lg_size_);
106
- }
107
- num_keys_ = match_count;
108
- }
109
- }
31
+ template<typename SS>
32
+ void theta_intersection_alloc<A>::update(SS&& sketch) {
33
+ state_.update(std::forward<SS>(sketch));
110
34
  }
111
35
 
112
36
  template<typename A>
113
- compact_theta_sketch_alloc<A> theta_intersection_alloc<A>::get_result(bool ordered) const {
114
- if (!is_valid_) throw std::invalid_argument("calling get_result() before calling update() is undefined");
115
- vector_u64<A> keys(num_keys_);
116
- if (num_keys_ > 0) {
117
- std::copy_if(keys_.begin(), keys_.end(), keys.begin(), [](uint64_t key) { return key != 0; });
118
- if (ordered) std::sort(keys.begin(), keys.end());
119
- }
120
- return compact_theta_sketch_alloc<A>(is_empty_, theta_, std::move(keys), seed_hash_, ordered);
37
+ auto theta_intersection_alloc<A>::get_result(bool ordered) const -> CompactSketch {
38
+ return state_.get_result(ordered);
121
39
  }
122
40
 
123
41
  template<typename A>
124
42
  bool theta_intersection_alloc<A>::has_result() const {
125
- return is_valid_;
43
+ return state_.has_result();
126
44
  }
127
45
 
128
46
  } /* namespace datasketches */
@@ -17,28 +17,21 @@
17
17
  * under the License.
18
18
  */
19
19
 
20
- #include <iostream>
20
+ #ifndef THETA_JACCARD_SIMILARITY_HPP_
21
+ #define THETA_JACCARD_SIMILARITY_HPP_
21
22
 
22
- #include <catch.hpp>
23
- #include <tuple_union.hpp>
24
-
25
- #include <theta_union_experimental.hpp>
23
+ #include "theta_jaccard_similarity_base.hpp"
24
+ #include "theta_union.hpp"
25
+ #include "theta_intersection.hpp"
26
26
 
27
27
  namespace datasketches {
28
28
 
29
- TEST_CASE("theta_union_exeperimental") {
30
- auto update_sketch1 = update_theta_sketch_experimental<>::builder().build();
31
- update_sketch1.update(1);
32
- update_sketch1.update(2);
33
-
34
- auto update_sketch2 = update_theta_sketch_experimental<>::builder().build();
35
- update_sketch2.update(1);
36
- update_sketch2.update(3);
29
+ template<typename Allocator = std::allocator<uint64_t>>
30
+ using theta_jaccard_similarity_alloc = jaccard_similarity_base<theta_union_alloc<Allocator>, theta_intersection_alloc<Allocator>, trivial_extract_key>;
37
31
 
38
- auto u = theta_union_experimental<>::builder().build();
39
- u.update(update_sketch1);
40
- u.update(update_sketch2);
41
- auto r = u.get_result();
42
- }
32
+ // alias with default allocator for convenience
33
+ using theta_jaccard_similarity = theta_jaccard_similarity_alloc<std::allocator<uint64_t>>;
43
34
 
44
35
  } /* namespace datasketches */
36
+
37
+ # endif
@@ -17,19 +17,16 @@
17
17
  * under the License.
18
18
  */
19
19
 
20
- #ifndef JACCARD_SIMILARITY_BASE_HPP_
21
- #define JACCARD_SIMILARITY_BASE_HPP_
20
+ #ifndef THETA_JACCARD_SIMILARITY_BASE_HPP_
21
+ #define THETA_JACCARD_SIMILARITY_BASE_HPP_
22
22
 
23
23
  #include <memory>
24
24
  #include <array>
25
25
 
26
- #include <theta_union_experimental.hpp>
27
- #include <theta_intersection_experimental.hpp>
28
- #include <tuple_union.hpp>
29
- #include <tuple_intersection.hpp>
30
- #include <bounds_on_ratios_in_theta_sketched_sets.hpp>
31
- #include <ceiling_power_of_2.hpp>
32
- #include <common_defs.hpp>
26
+ #include "theta_constants.hpp"
27
+ #include "bounds_on_ratios_in_theta_sketched_sets.hpp"
28
+ #include "ceiling_power_of_2.hpp"
29
+ #include "common_defs.hpp"
33
30
 
34
31
  namespace datasketches {
35
32
 
@@ -154,19 +151,6 @@ private:
154
151
 
155
152
  };
156
153
 
157
- template<typename Allocator>
158
- using theta_jaccard_similarity_alloc = jaccard_similarity_base<theta_union_experimental<Allocator>, theta_intersection_experimental<Allocator>, trivial_extract_key>;
159
-
160
- // alias with default allocator for convenience
161
- using theta_jaccard_similarity = theta_jaccard_similarity_alloc<std::allocator<uint64_t>>;
162
-
163
- template<
164
- typename Summary,
165
- typename IntersectionPolicy,
166
- typename UnionPolicy = default_union_policy<Summary>,
167
- typename Allocator = std::allocator<Summary>>
168
- using tuple_jaccard_similarity = jaccard_similarity_base<tuple_union<Summary, UnionPolicy, Allocator>, tuple_intersection<Summary, IntersectionPolicy, Allocator>, pair_extract_key<uint64_t, Summary>>;
169
-
170
154
  } /* namespace datasketches */
171
155
 
172
156
  # endif
@@ -17,6 +17,9 @@
17
17
  * under the License.
18
18
  */
19
19
 
20
+ #ifndef THETA_A_SET_DIFFERENCE_BASE_IMPL_HPP_
21
+ #define THETA_A_SET_DIFFERENCE_BASE_IMPL_HPP_
22
+
20
23
  #include <algorithm>
21
24
 
22
25
  #include "conditional_back_inserter.hpp"
@@ -78,3 +81,5 @@ CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch
78
81
  }
79
82
 
80
83
  } /* namespace datasketches */
84
+
85
+ #endif
@@ -20,45 +20,29 @@
20
20
  #ifndef THETA_SKETCH_HPP_
21
21
  #define THETA_SKETCH_HPP_
22
22
 
23
- #include <memory>
24
- #include <functional>
25
- #include <climits>
26
- #include <vector>
27
-
28
- #include "common_defs.hpp"
23
+ #include "theta_update_sketch_base.hpp"
29
24
 
30
25
  namespace datasketches {
31
26
 
32
- /*
33
- * author Alexander Saydakov
34
- * author Lee Rhodes
35
- * author Kevin Lang
36
- */
37
-
38
- // forward-declarations
39
- template<typename A> class theta_sketch_alloc;
40
- template<typename A> class update_theta_sketch_alloc;
41
- template<typename A> class compact_theta_sketch_alloc;
42
- template<typename A> class theta_union_alloc;
43
- template<typename A> class theta_intersection_alloc;
44
- template<typename A> class theta_a_not_b_alloc;
45
-
46
- // for serialization as raw bytes
47
- template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
48
- template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
49
-
50
- template<typename A>
27
+ template<typename Allocator = std::allocator<uint64_t>>
51
28
  class theta_sketch_alloc {
52
29
  public:
53
- static const uint64_t MAX_THETA = LLONG_MAX; // signed max for compatibility with Java
54
- static const uint8_t SERIAL_VERSION = 3;
30
+ using Entry = uint64_t;
31
+ using ExtractKey = trivial_extract_key;
32
+ using iterator = theta_iterator<Entry, ExtractKey>;
33
+ using const_iterator = theta_const_iterator<Entry, ExtractKey>;
55
34
 
56
35
  virtual ~theta_sketch_alloc() = default;
57
36
 
37
+ /**
38
+ * @return allocator
39
+ */
40
+ virtual Allocator get_allocator() const = 0;
41
+
58
42
  /**
59
43
  * @return true if this sketch represents an empty set (not the same as no retained entries!)
60
44
  */
61
- bool is_empty() const;
45
+ virtual bool is_empty() const = 0;
62
46
 
63
47
  /**
64
48
  * @return estimate of the distinct count of the input stream
@@ -96,13 +80,16 @@ public:
96
80
  /**
97
81
  * @return theta as a positive integer between 0 and LLONG_MAX
98
82
  */
99
- uint64_t get_theta64() const;
83
+ virtual uint64_t get_theta64() const = 0;
100
84
 
101
85
  /**
102
86
  * @return the number of retained entries in the sketch
103
87
  */
104
88
  virtual uint32_t get_num_retained() const = 0;
105
89
 
90
+ /**
91
+ * @return hash of the seed that was used to hash the input
92
+ */
106
93
  virtual uint16_t get_seed_hash() const = 0;
107
94
 
108
95
  /**
@@ -111,109 +98,82 @@ public:
111
98
  virtual bool is_ordered() const = 0;
112
99
 
113
100
  /**
114
- * Writes a human-readable summary of this sketch to a given stream
101
+ * Provides a human-readable summary of this sketch as a string
115
102
  * @param print_items if true include the list of items retained by the sketch
103
+ * @return sketch summary as a string
116
104
  */
117
- virtual string<A> to_string(bool print_items = false) const = 0;
118
-
119
- /**
120
- * This method serializes the sketch into a given stream in a binary form
121
- * @param os output stream
122
- */
123
- virtual void serialize(std::ostream& os) const = 0;
124
-
125
- // This is a convenience alias for users
126
- // The type returned by the following serialize method
127
- typedef vector_u8<A> vector_bytes;
105
+ virtual string<Allocator> to_string(bool print_items = false) const;
128
106
 
129
107
  /**
130
- * This method serializes the sketch as a vector of bytes.
131
- * An optional header can be reserved in front of the sketch.
132
- * It is an uninitialized space of a given size.
133
- * This header is used in Datasketches PostgreSQL extension.
134
- * @param header_size_bytes space to reserve in front of the sketch
135
- */
136
- virtual vector_bytes serialize(unsigned header_size_bytes = 0) const = 0;
137
-
138
- // This is a convenience alias for users
139
- // The type returned by the following deserialize methods
140
- // It is not possible to return instances of an abstract type, so this has to be a pointer
141
- typedef std::unique_ptr<theta_sketch_alloc<A>, std::function<void(theta_sketch_alloc<A>*)>> unique_ptr;
142
-
143
- /**
144
- * This method deserializes a sketch from a given stream.
145
- * @param is input stream
146
- * @param seed the seed for the hash function that was used to create the sketch
147
- * @return an instance of a sketch as a unique_ptr
108
+ * Iterator over hash values in this sketch.
109
+ * @return begin iterator
148
110
  */
149
- static unique_ptr deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
111
+ virtual iterator begin() = 0;
150
112
 
151
113
  /**
152
- * This method deserializes a sketch from a given array of bytes.
153
- * @param bytes pointer to the array of bytes
154
- * @param size the size of the array
155
- * @param seed the seed for the hash function that was used to create the sketch
156
- * @return an instance of the sketch
114
+ * Iterator pointing past the valid range.
115
+ * Not to be incremented or dereferenced.
116
+ * @return end iterator
157
117
  */
158
- static unique_ptr deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
159
-
160
- class const_iterator;
118
+ virtual iterator end() = 0;
161
119
 
162
120
  /**
163
- * Iterator over hash values in this sketch.
121
+ * Const iterator over hash values in this sketch.
164
122
  * @return begin iterator
165
123
  */
166
124
  virtual const_iterator begin() const = 0;
167
125
 
168
126
  /**
169
- * Iterator pointing past the valid range.
127
+ * Const iterator pointing past the valid range.
170
128
  * Not to be incremented or dereferenced.
171
129
  * @return end iterator
172
130
  */
173
131
  virtual const_iterator end() const = 0;
174
132
 
175
133
  protected:
176
- enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
177
-
178
- bool is_empty_;
179
- uint64_t theta_;
180
-
181
- theta_sketch_alloc(bool is_empty, uint64_t theta);
182
-
183
- static uint16_t get_seed_hash(uint64_t seed);
184
-
185
- static void check_sketch_type(uint8_t actual, uint8_t expected);
186
- static void check_serial_version(uint8_t actual, uint8_t expected);
187
- static void check_seed_hash(uint16_t actual, uint16_t expected);
188
-
189
- friend theta_intersection_alloc<A>;
190
- friend theta_a_not_b_alloc<A>;
134
+ using ostrstream = std::basic_ostringstream<char, std::char_traits<char>, AllocChar<Allocator>>;
135
+ virtual void print_specifics(ostrstream& os) const = 0;
191
136
  };
192
137
 
193
- // update sketch
194
-
195
- template<typename A> using AllocU64 = typename std::allocator_traits<A>::template rebind_alloc<uint64_t>;
196
- template<typename A> using vector_u64 = std::vector<uint64_t, AllocU64<A>>;
138
+ // forward declaration
139
+ template<typename A> class compact_theta_sketch_alloc;
197
140
 
198
- template<typename A>
199
- class update_theta_sketch_alloc: public theta_sketch_alloc<A> {
141
+ template<typename Allocator = std::allocator<uint64_t>>
142
+ class update_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
200
143
  public:
201
- class builder;
202
- enum resize_factor { X1, X2, X4, X8 };
203
- static const uint8_t SKETCH_TYPE = 2;
144
+ using Base = theta_sketch_alloc<Allocator>;
145
+ using Entry = typename Base::Entry;
146
+ using ExtractKey = typename Base::ExtractKey;
147
+ using iterator = typename Base::iterator;
148
+ using const_iterator = typename Base::const_iterator;
149
+ using theta_table = theta_update_sketch_base<Entry, ExtractKey, Allocator>;
150
+ using resize_factor = typename theta_table::resize_factor;
204
151
 
205
152
  // No constructor here. Use builder instead.
153
+ class builder;
206
154
 
155
+ update_theta_sketch_alloc(const update_theta_sketch_alloc&) = default;
156
+ update_theta_sketch_alloc(update_theta_sketch_alloc&&) noexcept = default;
207
157
  virtual ~update_theta_sketch_alloc() = default;
158
+ update_theta_sketch_alloc& operator=(const update_theta_sketch_alloc&) = default;
159
+ update_theta_sketch_alloc& operator=(update_theta_sketch_alloc&&) = default;
208
160
 
209
- virtual uint32_t get_num_retained() const;
210
- virtual uint16_t get_seed_hash() const;
161
+ virtual Allocator get_allocator() const;
162
+ virtual bool is_empty() const;
211
163
  virtual bool is_ordered() const;
212
- virtual string<A> to_string(bool print_items = false) const;
213
- virtual void serialize(std::ostream& os) const;
214
- typedef vector_u8<A> vector_bytes; // alias for users
215
- // header space is reserved, but not initialized
216
- virtual vector_bytes serialize(unsigned header_size_bytes = 0) const;
164
+ virtual uint16_t get_seed_hash() const;
165
+ virtual uint64_t get_theta64() const;
166
+ virtual uint32_t get_num_retained() const;
167
+
168
+ /**
169
+ * @return configured nominal number of entries in the sketch
170
+ */
171
+ uint8_t get_lg_k() const;
172
+
173
+ /**
174
+ * @return configured resize factor of the sketch
175
+ */
176
+ resize_factor get_rf() const;
217
177
 
218
178
  /**
219
179
  * Update this sketch with a given string.
@@ -302,7 +262,7 @@ public:
302
262
  * @param data pointer to the data
303
263
  * @param length of the data in bytes
304
264
  */
305
- void update(const void* data, unsigned length);
265
+ void update(const void* data, size_t length);
306
266
 
307
267
  /**
308
268
  * Remove retained entries in excess of the nominal size k (if any)
@@ -314,105 +274,85 @@ public:
314
274
  * @param ordered optional flag to specify if ordered sketch should be produced
315
275
  * @return compact sketch
316
276
  */
317
- compact_theta_sketch_alloc<A> compact(bool ordered = true) const;
318
-
319
- virtual typename theta_sketch_alloc<A>::const_iterator begin() const;
320
- virtual typename theta_sketch_alloc<A>::const_iterator end() const;
321
-
322
- /**
323
- * This method deserializes a sketch from a given stream.
324
- * @param is input stream
325
- * @param seed the seed for the hash function that was used to create the sketch
326
- * @return an instance of a sketch
327
- */
328
- static update_theta_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
277
+ compact_theta_sketch_alloc<Allocator> compact(bool ordered = true) const;
329
278
 
330
- /**
331
- * This method deserializes a sketch from a given array of bytes.
332
- * @param bytes pointer to the array of bytes
333
- * @param size the size of the array
334
- * @param seed the seed for the hash function that was used to create the sketch
335
- * @return an instance of the sketch
336
- */
337
- static update_theta_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
279
+ virtual iterator begin();
280
+ virtual iterator end();
281
+ virtual const_iterator begin() const;
282
+ virtual const_iterator end() const;
338
283
 
339
284
  private:
340
- // resize threshold = 0.5 tuned for speed
341
- static constexpr double RESIZE_THRESHOLD = 0.5;
342
- // hash table rebuild threshold = 15/16
343
- static constexpr double REBUILD_THRESHOLD = 15.0 / 16.0;
344
-
345
- static constexpr uint8_t STRIDE_HASH_BITS = 7;
346
- static constexpr uint32_t STRIDE_MASK = (1 << STRIDE_HASH_BITS) - 1;
347
-
348
- uint8_t lg_cur_size_;
349
- uint8_t lg_nom_size_;
350
- vector_u64<A> keys_;
351
- uint32_t num_keys_;
352
- resize_factor rf_;
353
- float p_;
354
- uint64_t seed_;
355
- uint32_t capacity_;
285
+ theta_table table_;
356
286
 
357
287
  // for builder
358
- update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed);
359
-
360
- // for deserialize
361
- update_theta_sketch_alloc(bool is_empty, uint64_t theta, uint8_t lg_cur_size, uint8_t lg_nom_size, vector_u64<A>&& keys, uint32_t num_keys, resize_factor rf, float p, uint64_t seed);
288
+ update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta,
289
+ uint64_t seed, const Allocator& allocator);
362
290
 
363
- void resize();
364
- void rebuild();
365
-
366
- friend theta_union_alloc<A>;
367
- void internal_update(uint64_t hash);
368
-
369
- friend theta_intersection_alloc<A>;
370
- friend theta_a_not_b_alloc<A>;
371
- static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
372
- static inline uint32_t get_stride(uint64_t hash, uint8_t lg_size);
373
- static bool hash_search_or_insert(uint64_t hash, uint64_t* table, uint8_t lg_size);
374
- static bool hash_search(uint64_t hash, const uint64_t* table, uint8_t lg_size);
375
-
376
- friend theta_sketch_alloc<A>;
377
- static update_theta_sketch_alloc<A> internal_deserialize(std::istream& is, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed);
378
- static update_theta_sketch_alloc<A> internal_deserialize(const void* bytes, size_t size, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed);
291
+ using ostrstream = typename Base::ostrstream;
292
+ virtual void print_specifics(ostrstream& os) const;
379
293
  };
380
294
 
381
295
  // compact sketch
382
296
 
383
- template<typename A>
384
- class compact_theta_sketch_alloc: public theta_sketch_alloc<A> {
297
+ template<typename Allocator = std::allocator<uint64_t>>
298
+ class compact_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
385
299
  public:
300
+ using Base = theta_sketch_alloc<Allocator>;
301
+ using iterator = typename Base::iterator;
302
+ using const_iterator = typename Base::const_iterator;
303
+ using AllocBytes = typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>;
304
+ using vector_bytes = std::vector<uint8_t, AllocBytes>;
305
+
306
+ static const uint8_t SERIAL_VERSION = 3;
386
307
  static const uint8_t SKETCH_TYPE = 3;
387
308
 
388
- // No constructor here.
389
309
  // Instances of this type can be obtained:
390
- // - by compacting an update_theta_sketch
310
+ // - by compacting an update_theta_sketch_alloc
391
311
  // - as a result of a set operation
392
312
  // - by deserializing a previously serialized compact sketch
393
313
 
394
- compact_theta_sketch_alloc(const theta_sketch_alloc<A>& other, bool ordered);
314
+ compact_theta_sketch_alloc(const Base& other, bool ordered);
315
+ compact_theta_sketch_alloc(const compact_theta_sketch_alloc&) = default;
316
+ compact_theta_sketch_alloc(compact_theta_sketch_alloc&&) noexcept = default;
395
317
  virtual ~compact_theta_sketch_alloc() = default;
318
+ compact_theta_sketch_alloc& operator=(const compact_theta_sketch_alloc&) = default;
319
+ compact_theta_sketch_alloc& operator=(compact_theta_sketch_alloc&&) = default;
396
320
 
321
+ virtual Allocator get_allocator() const;
322
+ virtual bool is_empty() const;
323
+ virtual bool is_ordered() const;
324
+ virtual uint64_t get_theta64() const;
397
325
  virtual uint32_t get_num_retained() const;
398
326
  virtual uint16_t get_seed_hash() const;
399
- virtual bool is_ordered() const;
400
- virtual string<A> to_string(bool print_items = false) const;
401
- virtual void serialize(std::ostream& os) const;
402
- typedef vector_u8<A> vector_bytes; // alias for users
403
- // header space is reserved, but not initialized
404
- virtual vector_bytes serialize(unsigned header_size_bytes = 0) const;
405
327
 
406
- virtual typename theta_sketch_alloc<A>::const_iterator begin() const;
407
- virtual typename theta_sketch_alloc<A>::const_iterator end() const;
328
+ /**
329
+ * This method serializes the sketch into a given stream in a binary form
330
+ * @param os output stream
331
+ */
332
+ void serialize(std::ostream& os) const;
333
+
334
+ /**
335
+ * This method serializes the sketch as a vector of bytes.
336
+ * An optional header can be reserved in front of the sketch.
337
+ * It is an uninitialized space of a given size.
338
+ * This header is used in Datasketches PostgreSQL extension.
339
+ * @param header_size_bytes space to reserve in front of the sketch
340
+ */
341
+ vector_bytes serialize(unsigned header_size_bytes = 0) const;
342
+
343
+ virtual iterator begin();
344
+ virtual iterator end();
345
+ virtual const_iterator begin() const;
346
+ virtual const_iterator end() const;
408
347
 
409
348
  /**
410
349
  * This method deserializes a sketch from a given stream.
411
350
  * @param is input stream
412
351
  * @param seed the seed for the hash function that was used to create the sketch
413
- * @return an instance of a sketch
352
+ * @return an instance of the sketch
414
353
  */
415
- static compact_theta_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
354
+ static compact_theta_sketch_alloc deserialize(std::istream& is,
355
+ uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
416
356
 
417
357
  /**
418
358
  * This method deserializes a sketch from a given array of bytes.
@@ -421,110 +361,36 @@ public:
421
361
  * @param seed the seed for the hash function that was used to create the sketch
422
362
  * @return an instance of the sketch
423
363
  */
424
- static compact_theta_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
364
+ static compact_theta_sketch_alloc deserialize(const void* bytes, size_t size,
365
+ uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
366
+
367
+ // for internal use
368
+ compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<uint64_t, Allocator>&& entries);
425
369
 
426
370
  private:
427
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
371
+ enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
428
372
 
429
- vector_u64<A> keys_;
430
- uint16_t seed_hash_;
373
+ bool is_empty_;
431
374
  bool is_ordered_;
375
+ uint16_t seed_hash_;
376
+ uint64_t theta_;
377
+ std::vector<uint64_t, Allocator> entries_;
432
378
 
433
- friend theta_sketch_alloc<A>;
434
- friend update_theta_sketch_alloc<A>;
435
- friend theta_union_alloc<A>;
436
- friend theta_intersection_alloc<A>;
437
- friend theta_a_not_b_alloc<A>;
438
- compact_theta_sketch_alloc(bool is_empty, uint64_t theta, vector_u64<A>&& keys, uint16_t seed_hash, bool is_ordered);
439
- static compact_theta_sketch_alloc<A> internal_deserialize(std::istream& is, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash);
440
- static compact_theta_sketch_alloc<A> internal_deserialize(const void* bytes, size_t size, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash);
441
- };
442
-
443
- // builder
444
-
445
- template<typename A>
446
- class update_theta_sketch_alloc<A>::builder {
447
- public:
448
- static const uint8_t MIN_LG_K = 5;
449
- static const uint8_t DEFAULT_LG_K = 12;
450
- static const resize_factor DEFAULT_RESIZE_FACTOR = X8;
451
-
452
- /**
453
- * Creates and instance of the builder with default parameters.
454
- */
455
- builder();
456
-
457
- /**
458
- * Set log2(k), where k is a nominal number of entries in the sketch
459
- * @param lg_k base 2 logarithm of nominal number of entries
460
- * @return this builder
461
- */
462
- builder& set_lg_k(uint8_t lg_k);
463
-
464
- /**
465
- * Set resize factor for the internal hash table (defaults to 8)
466
- * @param rf resize factor
467
- * @return this builder
468
- */
469
- builder& set_resize_factor(resize_factor rf);
470
-
471
- /**
472
- * Set sampling probability (initial theta). The default is 1, so the sketch retains
473
- * all entries until it reaches the limit, at which point it goes into the estimation mode
474
- * and reduces the effective sampling probability (theta) as necessary.
475
- * @param p sampling probability
476
- * @return this builder
477
- */
478
- builder& set_p(float p);
479
-
480
- /**
481
- * Set the seed for the hash function. Should be used carefully if needed.
482
- * Sketches produced with different seed are not compatible
483
- * and cannot be mixed in set operations.
484
- * @param seed hash seed
485
- * @return this builder
486
- */
487
- builder& set_seed(uint64_t seed);
488
-
489
- /**
490
- * This is to create an instance of the sketch with predefined parameters.
491
- * @return and instance of the sketch
492
- */
493
- update_theta_sketch_alloc<A> build() const;
494
-
495
- private:
496
- uint8_t lg_k_;
497
- resize_factor rf_;
498
- float p_;
499
- uint64_t seed_;
500
-
501
- static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
379
+ using ostrstream = typename Base::ostrstream;
380
+ virtual void print_specifics(ostrstream& os) const;
502
381
  };
503
382
 
504
- // iterator
505
- template<typename A>
506
- class theta_sketch_alloc<A>::const_iterator: public std::iterator<std::input_iterator_tag, uint64_t> {
383
+ template<typename Allocator>
384
+ class update_theta_sketch_alloc<Allocator>::builder: public theta_base_builder<builder, Allocator> {
507
385
  public:
508
- const_iterator& operator++();
509
- const_iterator operator++(int);
510
- bool operator==(const const_iterator& other) const;
511
- bool operator!=(const const_iterator& other) const;
512
- uint64_t operator*() const;
513
-
514
- private:
515
- const uint64_t* keys_;
516
- uint32_t size_;
517
- uint32_t index_;
518
- const_iterator(const uint64_t* keys, uint32_t size, uint32_t index);
519
- friend class update_theta_sketch_alloc<A>;
520
- friend class compact_theta_sketch_alloc<A>;
386
+ builder(const Allocator& allocator = Allocator());
387
+ update_theta_sketch_alloc build() const;
521
388
  };
522
389
 
523
-
524
390
  // aliases with default allocator for convenience
525
- typedef theta_sketch_alloc<std::allocator<void>> theta_sketch;
526
- typedef update_theta_sketch_alloc<std::allocator<void>> update_theta_sketch;
527
- typedef compact_theta_sketch_alloc<std::allocator<void>> compact_theta_sketch;
391
+ using theta_sketch = theta_sketch_alloc<std::allocator<uint64_t>>;
392
+ using update_theta_sketch = update_theta_sketch_alloc<std::allocator<uint64_t>>;
393
+ using compact_theta_sketch = compact_theta_sketch_alloc<std::allocator<uint64_t>>;
528
394
 
529
395
  } /* namespace datasketches */
530
396