datasketches 0.1.2 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (160) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +12 -13
  4. data/ext/datasketches/ext.cpp +1 -1
  5. data/ext/datasketches/ext.h +4 -0
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/fi_wrapper.cpp +6 -8
  8. data/ext/datasketches/hll_wrapper.cpp +13 -14
  9. data/ext/datasketches/kll_wrapper.cpp +28 -76
  10. data/ext/datasketches/theta_wrapper.cpp +27 -41
  11. data/ext/datasketches/vo_wrapper.cpp +4 -6
  12. data/lib/datasketches/version.rb +1 -1
  13. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  14. data/vendor/datasketches-cpp/README.md +4 -4
  15. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
  16. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
  17. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
  18. data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
  19. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
  20. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
  21. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
  22. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
  23. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
  24. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
  25. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
  26. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
  27. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
  28. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
  29. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
  30. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
  31. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
  32. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
  33. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
  34. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
  35. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
  36. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
  37. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
  38. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  39. data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
  40. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
  41. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
  43. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
  44. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
  45. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
  46. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
  47. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
  48. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
  49. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
  50. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
  51. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
  52. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
  53. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
  54. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
  55. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
  56. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
  57. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
  58. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
  59. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
  60. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
  62. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
  63. data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
  64. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
  65. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
  66. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
  67. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
  68. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
  70. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
  71. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
  72. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
  74. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
  75. data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
  76. data/vendor/datasketches-cpp/python/README.md +6 -3
  77. data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
  78. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
  79. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
  80. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
  81. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
  82. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
  83. data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
  84. data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
  85. data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
  86. data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
  87. data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
  88. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
  89. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
  90. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
  91. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
  92. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
  93. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
  94. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
  95. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  96. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  97. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  98. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  99. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  100. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
  101. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
  102. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
  103. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
  104. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
  105. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
  106. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
  107. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
  108. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
  109. data/vendor/datasketches-cpp/setup.py +5 -3
  110. data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
  111. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
  112. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
  114. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
  115. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
  116. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
  117. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
  119. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
  120. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
  121. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
  122. data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
  123. data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
  124. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
  125. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
  126. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
  127. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
  128. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
  129. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
  130. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
  131. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
  132. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
  133. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
  134. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  135. data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
  136. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
  137. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
  138. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
  139. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
  140. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
  141. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
  142. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
  143. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
  144. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
  145. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
  146. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
  147. metadata +43 -34
  148. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
  149. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
  150. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
  151. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
  152. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
  153. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
  154. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
  155. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
  156. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  157. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
  160. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -20,103 +20,70 @@
20
20
  #ifndef THETA_UNION_HPP_
21
21
  #define THETA_UNION_HPP_
22
22
 
23
- #include <memory>
24
- #include <functional>
25
- #include <climits>
26
-
23
+ #include "serde.hpp"
27
24
  #include "theta_sketch.hpp"
25
+ #include "theta_union_base.hpp"
28
26
 
29
27
  namespace datasketches {
30
28
 
31
- /*
32
- * author Alexander Saydakov
33
- * author Lee Rhodes
34
- * author Kevin Lang
35
- */
36
-
37
- template<typename A>
29
+ template<typename Allocator = std::allocator<uint64_t>>
38
30
  class theta_union_alloc {
39
31
  public:
40
- class builder;
32
+ using Entry = uint64_t;
33
+ using ExtractKey = trivial_extract_key;
34
+ using Sketch = theta_sketch_alloc<Allocator>;
35
+ using CompactSketch = compact_theta_sketch_alloc<Allocator>;
36
+ using resize_factor = theta_constants::resize_factor;
37
+
38
+ struct pass_through_policy {
39
+ uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
40
+ unused(incoming_entry);
41
+ return internal_entry;
42
+ }
43
+ };
44
+ using State = theta_union_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
41
45
 
42
46
  // No constructor here. Use builder instead.
47
+ class builder;
43
48
 
44
49
  /**
45
50
  * This method is to update the union with a given sketch
46
51
  * @param sketch to update the union with
47
52
  */
48
- void update(const theta_sketch_alloc<A>& sketch);
53
+ template<typename FwdSketch>
54
+ void update(FwdSketch&& sketch);
49
55
 
50
56
  /**
51
57
  * This method produces a copy of the current state of the union as a compact sketch.
52
58
  * @param ordered optional flag to specify if ordered sketch should be produced
53
59
  * @return the result of the union
54
60
  */
55
- compact_theta_sketch_alloc<A> get_result(bool ordered = true) const;
61
+ CompactSketch get_result(bool ordered = true) const;
56
62
 
57
63
  private:
58
- bool is_empty_;
59
- uint64_t theta_;
60
- update_theta_sketch_alloc<A> state_;
64
+ State state_;
61
65
 
62
66
  // for builder
63
- theta_union_alloc(uint64_t theta, update_theta_sketch_alloc<A>&& state);
67
+ theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Allocator& allocator);
64
68
  };
65
69
 
66
- // builder
67
-
68
70
  template<typename A>
69
- class theta_union_alloc<A>::builder {
71
+ class theta_union_alloc<A>::builder: public theta_base_builder<builder, A> {
70
72
  public:
71
- typedef typename update_theta_sketch_alloc<A>::resize_factor resize_factor;
72
-
73
- /**
74
- * Set log2(k), where k is a nominal number of entries in the sketch
75
- * @param lg_k base 2 logarithm of nominal number of entries
76
- * @return this builder
77
- */
78
- builder& set_lg_k(uint8_t lg_k);
79
-
80
- /**
81
- * Set resize factor for the internal hash table (defaults to 8)
82
- * @param rf resize factor
83
- * @return this builder
84
- */
85
- builder& set_resize_factor(resize_factor rf);
86
-
87
- /**
88
- * Set sampling probability (initial theta). The default is 1, so the sketch retains
89
- * all entries until it reaches the limit, at which point it goes into the estimation mode
90
- * and reduces the effective sampling probability (theta) as necessary.
91
- * @param p sampling probability
92
- * @return this builder
93
- */
94
- builder& set_p(float p);
95
-
96
- /**
97
- * Set the seed for the hash function. Should be used carefully if needed.
98
- * Sketches produced with different seed are not compatible
99
- * and cannot be mixed in set operations.
100
- * @param seed hash seed
101
- * @return this builder
102
- */
103
- builder& set_seed(uint64_t seed);
73
+ builder(const A& allocator = A());
104
74
 
105
75
  /**
106
76
  * This is to create an instance of the union with predefined parameters.
107
- * @return and instance of the union
77
+ * @return an instance of the union
108
78
  */
109
79
  theta_union_alloc<A> build() const;
110
-
111
- private:
112
- typename update_theta_sketch_alloc<A>::builder sketch_builder;
113
80
  };
114
81
 
115
82
  // alias with default allocator for convenience
116
- typedef theta_union_alloc<std::allocator<void>> theta_union;
83
+ using theta_union = theta_union_alloc<std::allocator<uint64_t>>;
117
84
 
118
85
  } /* namespace datasketches */
119
86
 
120
87
  #include "theta_union_impl.hpp"
121
88
 
122
- # endif
89
+ #endif
@@ -30,7 +30,7 @@ template<
30
30
  typename Policy,
31
31
  typename Sketch,
32
32
  typename CompactSketch,
33
- typename Allocator = std::allocator<Entry>
33
+ typename Allocator
34
34
  >
35
35
  class theta_union_base {
36
36
  public:
@@ -17,6 +17,9 @@
17
17
  * under the License.
18
18
  */
19
19
 
20
+ #ifndef THETA_UNION_BASE_IMPL_HPP_
21
+ #define THETA_UNION_BASE_IMPL_HPP_
22
+
20
23
  #include <algorithm>
21
24
 
22
25
  #include "conditional_forward.hpp"
@@ -82,3 +85,5 @@ const P& theta_union_base<EN, EK, P, S, CS, A>::get_policy() const {
82
85
  }
83
86
 
84
87
  } /* namespace datasketches */
88
+
89
+ #endif
@@ -22,86 +22,30 @@
22
22
 
23
23
  namespace datasketches {
24
24
 
25
- /*
26
- * author Alexander Saydakov
27
- * author Lee Rhodes
28
- * author Kevin Lang
29
- */
30
-
31
- template<typename A>
32
- theta_union_alloc<A>::theta_union_alloc(uint64_t theta, update_theta_sketch_alloc<A>&& state):
33
- is_empty_(true), theta_(theta), state_(std::move(state)) {}
34
-
35
- template<typename A>
36
- void theta_union_alloc<A>::update(const theta_sketch_alloc<A>& sketch) {
37
- if (sketch.is_empty()) return;
38
- if (sketch.get_seed_hash() != state_.get_seed_hash()) throw std::invalid_argument("seed hash mismatch");
39
- is_empty_ = false;
40
- if (sketch.get_theta64() < theta_) theta_ = sketch.get_theta64();
41
- if (sketch.is_ordered()) {
42
- for (auto hash: sketch) {
43
- if (hash >= theta_) break; // early stop
44
- state_.internal_update(hash);
45
- }
46
- } else {
47
- for (auto hash: sketch) if (hash < theta_) state_.internal_update(hash);
48
- }
49
- if (state_.get_theta64() < theta_) theta_ = state_.get_theta64();
50
- }
51
-
52
25
  template<typename A>
53
- compact_theta_sketch_alloc<A> theta_union_alloc<A>::get_result(bool ordered) const {
54
- if (is_empty_) return state_.compact(ordered);
55
- const uint32_t nom_num_keys = 1 << state_.lg_nom_size_;
56
- if (theta_ >= state_.theta_ && state_.get_num_retained() <= nom_num_keys) return state_.compact(ordered);
57
- uint64_t theta = std::min(theta_, state_.get_theta64());
58
- vector_u64<A> keys(state_.get_num_retained());
59
- uint32_t num_keys = 0;
60
- for (auto key: state_) {
61
- if (key < theta) keys[num_keys++] = key;
62
- }
63
- if (num_keys > nom_num_keys) {
64
- std::nth_element(keys.begin(), keys.begin() + nom_num_keys, keys.begin() + num_keys);
65
- theta = keys[nom_num_keys];
66
- num_keys = nom_num_keys;
67
- }
68
- if (num_keys != state_.get_num_retained()) {
69
- keys.resize(num_keys);
70
- }
71
- if (ordered) std::sort(keys.begin(), keys.end());
72
- return compact_theta_sketch_alloc<A>(false, theta, std::move(keys), state_.get_seed_hash(), ordered);
73
- }
74
-
75
- // builder
26
+ theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator):
27
+ state_(lg_cur_size, lg_nom_size, rf, theta, seed, pass_through_policy(), allocator)
28
+ {}
76
29
 
77
30
  template<typename A>
78
- typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_lg_k(uint8_t lg_k) {
79
- sketch_builder.set_lg_k(lg_k);
80
- return *this;
31
+ template<typename SS>
32
+ void theta_union_alloc<A>::update(SS&& sketch) {
33
+ state_.update(std::forward<SS>(sketch));
81
34
  }
82
35
 
83
36
  template<typename A>
84
- typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_resize_factor(resize_factor rf) {
85
- sketch_builder.set_resize_factor(rf);
86
- return *this;
37
+ auto theta_union_alloc<A>::get_result(bool ordered) const -> CompactSketch {
38
+ return state_.get_result(ordered);
87
39
  }
88
40
 
89
41
  template<typename A>
90
- typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_p(float p) {
91
- sketch_builder.set_p(p);
92
- return *this;
93
- }
94
-
95
- template<typename A>
96
- typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_seed(uint64_t seed) {
97
- sketch_builder.set_seed(seed);
98
- return *this;
99
- }
42
+ theta_union_alloc<A>::builder::builder(const A& allocator): theta_base_builder<builder, A>(allocator) {}
100
43
 
101
44
  template<typename A>
102
- theta_union_alloc<A> theta_union_alloc<A>::builder::build() const {
103
- update_theta_sketch_alloc<A> sketch = sketch_builder.build();
104
- return theta_union_alloc(sketch.get_theta64(), std::move(sketch));
45
+ auto theta_union_alloc<A>::builder::build() const -> theta_union_alloc {
46
+ return theta_union_alloc(
47
+ this->starting_sub_multiple(this->lg_k_ + 1, this->MIN_LG_K, static_cast<uint8_t>(this->rf_)),
48
+ this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
105
49
  }
106
50
 
107
51
  } /* namespace datasketches */
@@ -34,7 +34,7 @@ namespace datasketches {
34
34
  template<
35
35
  typename Entry,
36
36
  typename ExtractKey,
37
- typename Allocator = std::allocator<Entry>
37
+ typename Allocator
38
38
  >
39
39
  struct theta_update_sketch_base {
40
40
  using resize_factor = theta_constants::resize_factor;
@@ -147,7 +147,7 @@ protected:
147
147
  static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
148
148
  };
149
149
 
150
- // key extractors
150
+ // key extractor
151
151
 
152
152
  struct trivial_extract_key {
153
153
  template<typename T>
@@ -156,17 +156,7 @@ struct trivial_extract_key {
156
156
  }
157
157
  };
158
158
 
159
- template<typename K, typename V>
160
- struct pair_extract_key {
161
- K& operator()(std::pair<K, V>& entry) const {
162
- return entry.first;
163
- }
164
- const K& operator()(const std::pair<K, V>& entry) const {
165
- return entry.first;
166
- }
167
- };
168
-
169
- // not zero
159
+ // key not zero
170
160
 
171
161
  template<typename Entry, typename ExtractKey>
172
162
  class key_not_zero {
@@ -195,12 +185,6 @@ static inline uint64_t compute_hash(const void* data, size_t length, uint64_t se
195
185
  return (hashes.h1 >> 1); // Java implementation does unsigned shift >>> to make values positive
196
186
  }
197
187
 
198
- static inline uint16_t compute_seed_hash(uint64_t seed) {
199
- HashState hashes;
200
- MurmurHash3_x64_128(&seed, sizeof(seed), 0, hashes);
201
- return hashes.h1;
202
- }
203
-
204
188
  // iterators
205
189
 
206
190
  template<typename Entry, typename ExtractKey>
@@ -17,6 +17,9 @@
17
17
  * under the License.
18
18
  */
19
19
 
20
+ #ifndef THETA_UPDATE_SKETCH_BASE_IMPL_HPP_
21
+ #define THETA_UPDATE_SKETCH_BASE_IMPL_HPP_
22
+
20
23
  #include <iostream>
21
24
  #include <sstream>
22
25
  #include <algorithm>
@@ -69,7 +72,7 @@ entries_(nullptr)
69
72
 
70
73
  template<typename EN, typename EK, typename A>
71
74
  theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(theta_update_sketch_base&& other) noexcept:
72
- allocator_(other.allocator_),
75
+ allocator_(std::move(other.allocator_)),
73
76
  is_empty_(other.is_empty_),
74
77
  lg_cur_size_(other.lg_cur_size_),
75
78
  lg_nom_size_(other.lg_nom_size_),
@@ -387,3 +390,5 @@ auto theta_const_iterator<Entry, ExtractKey>::operator*() const -> const Entry&
387
390
  }
388
391
 
389
392
  } /* namespace datasketches */
393
+
394
+ #endif
@@ -42,4 +42,5 @@ target_sources(theta_test
42
42
  theta_union_test.cpp
43
43
  theta_intersection_test.cpp
44
44
  theta_a_not_b_test.cpp
45
+ theta_jaccard_similarity_test.cpp
45
46
  )
@@ -20,11 +20,10 @@
20
20
  #include <iostream>
21
21
 
22
22
  #include <catch.hpp>
23
- #include <jaccard_similarity.hpp>
24
23
 
25
- namespace datasketches {
24
+ #include "theta_jaccard_similarity.hpp"
26
25
 
27
- using update_theta_sketch = update_theta_sketch_experimental<>;
26
+ namespace datasketches {
28
27
 
29
28
  TEST_CASE("theta jaccard: empty", "[theta_sketch]") {
30
29
  auto sk_a = update_theta_sketch::builder().build();
@@ -17,10 +17,10 @@
17
17
  * under the License.
18
18
  */
19
19
 
20
- #include <catch.hpp>
21
20
  #include <fstream>
22
21
  #include <sstream>
23
22
 
23
+ #include <catch.hpp>
24
24
  #include <theta_sketch.hpp>
25
25
 
26
26
  namespace datasketches {
@@ -134,75 +134,7 @@ TEST_CASE("theta sketch: estimation", "[theta_sketch]") {
134
134
  REQUIRE(compact_sketch.get_upper_bound(1) > n);
135
135
  }
136
136
 
137
- TEST_CASE("theta sketch: deserialize update empty from java as base", "[theta_sketch]") {
138
- std::ifstream is;
139
- is.exceptions(std::ios::failbit | std::ios::badbit);
140
- is.open(inputPath + "theta_update_empty_from_java.sk", std::ios::binary);
141
- auto sketchptr = theta_sketch::deserialize(is);
142
- REQUIRE(sketchptr->is_empty());
143
- REQUIRE_FALSE(sketchptr->is_estimation_mode());
144
- REQUIRE(sketchptr->get_num_retained() == 0);
145
- REQUIRE(sketchptr->get_theta() == 1.0);
146
- REQUIRE(sketchptr->get_estimate() == 0.0);
147
- REQUIRE(sketchptr->get_lower_bound(1) == 0.0);
148
- REQUIRE(sketchptr->get_upper_bound(1) == 0.0);
149
- }
150
-
151
- TEST_CASE("theta sketch: deserialize update empty from java as subclass", "[theta_sketch]") {
152
- std::ifstream is;
153
- is.exceptions(std::ios::failbit | std::ios::badbit);
154
- is.open(inputPath + "theta_update_empty_from_java.sk", std::ios::binary);
155
- auto sketch = update_theta_sketch::deserialize(is);
156
- REQUIRE(sketch.is_empty());
157
- REQUIRE_FALSE(sketch.is_estimation_mode());
158
- REQUIRE(sketch.get_num_retained() == 0);
159
- REQUIRE(sketch.get_theta() == 1.0);
160
- REQUIRE(sketch.get_estimate() == 0.0);
161
- REQUIRE(sketch.get_lower_bound(1) == 0.0);
162
- REQUIRE(sketch.get_upper_bound(1) == 0.0);
163
- }
164
-
165
- TEST_CASE("theta sketch: deserialize update estimation from java as base", "[theta_sketch]") {
166
- std::ifstream is;
167
- is.exceptions(std::ios::failbit | std::ios::badbit);
168
- is.open(inputPath + "theta_update_estimation_from_java.sk", std::ios::binary);
169
- auto sketchptr = theta_sketch::deserialize(is);
170
- REQUIRE_FALSE(sketchptr->is_empty());
171
- REQUIRE(sketchptr->is_estimation_mode());
172
- REQUIRE(sketchptr->get_num_retained() == 5324);
173
- REQUIRE(sketchptr->get_estimate() == Approx(10000.0).margin(10000 * 0.01));
174
- REQUIRE(sketchptr->get_lower_bound(1) < 10000);
175
- REQUIRE(sketchptr->get_upper_bound(1) > 10000);
176
- }
177
-
178
- TEST_CASE("theta sketch: deserialize update estimation from java as subclass", "[theta_sketch]") {
179
- std::ifstream is;
180
- is.exceptions(std::ios::failbit | std::ios::badbit);
181
- is.open(inputPath + "theta_update_estimation_from_java.sk", std::ios::binary);
182
- auto sketch = update_theta_sketch::deserialize(is);
183
- REQUIRE_FALSE(sketch.is_empty());
184
- REQUIRE(sketch.is_estimation_mode());
185
- REQUIRE(sketch.get_num_retained() == 5324);
186
- REQUIRE(sketch.get_estimate() == Approx(10000.0).margin(10000 * 0.01));
187
- REQUIRE(sketch.get_lower_bound(1) < 10000);
188
- REQUIRE(sketch.get_upper_bound(1) > 10000);
189
- }
190
-
191
- TEST_CASE("theta sketch: deserialize compact empty from java as base", "[theta_sketch]") {
192
- std::ifstream is;
193
- is.exceptions(std::ios::failbit | std::ios::badbit);
194
- is.open(inputPath + "theta_compact_empty_from_java.sk", std::ios::binary);
195
- auto sketchptr = theta_sketch::deserialize(is);
196
- REQUIRE(sketchptr->is_empty());
197
- REQUIRE_FALSE(sketchptr->is_estimation_mode());
198
- REQUIRE(sketchptr->get_num_retained() == 0);
199
- REQUIRE(sketchptr->get_theta() == 1.0);
200
- REQUIRE(sketchptr->get_estimate() == 0.0);
201
- REQUIRE(sketchptr->get_lower_bound(1) == 0.0);
202
- REQUIRE(sketchptr->get_upper_bound(1) == 0.0);
203
- }
204
-
205
- TEST_CASE("theta sketch: deserialize compact empty from java as subclass", "[theta_sketch]") {
137
+ TEST_CASE("theta sketch: deserialize compact empty from java", "[theta_sketch]") {
206
138
  std::ifstream is;
207
139
  is.exceptions(std::ios::failbit | std::ios::badbit);
208
140
  is.open(inputPath + "theta_compact_empty_from_java.sk", std::ios::binary);
@@ -216,21 +148,7 @@ TEST_CASE("theta sketch: deserialize compact empty from java as subclass", "[the
216
148
  REQUIRE(sketch.get_upper_bound(1) == 0.0);
217
149
  }
218
150
 
219
- TEST_CASE("theta sketch: deserialize single item from java as base", "[theta_sketch]") {
220
- std::ifstream is;
221
- is.exceptions(std::ios::failbit | std::ios::badbit);
222
- is.open(inputPath + "theta_compact_single_item_from_java.sk", std::ios::binary);
223
- auto sketchptr = theta_sketch::deserialize(is);
224
- REQUIRE_FALSE(sketchptr->is_empty());
225
- REQUIRE_FALSE(sketchptr->is_estimation_mode());
226
- REQUIRE(sketchptr->get_num_retained() == 1);
227
- REQUIRE(sketchptr->get_theta() == 1.0);
228
- REQUIRE(sketchptr->get_estimate() == 1.0);
229
- REQUIRE(sketchptr->get_lower_bound(1) == 1.0);
230
- REQUIRE(sketchptr->get_upper_bound(1) == 1.0);
231
- }
232
-
233
- TEST_CASE("theta sketch: deserialize single item from java as subclass", "[theta_sketch]") {
151
+ TEST_CASE("theta sketch: deserialize single item from java", "[theta_sketch]") {
234
152
  std::ifstream is;
235
153
  is.exceptions(std::ios::failbit | std::ios::badbit);
236
154
  is.open(inputPath + "theta_compact_single_item_from_java.sk", std::ios::binary);
@@ -244,55 +162,21 @@ TEST_CASE("theta sketch: deserialize single item from java as subclass", "[theta
244
162
  REQUIRE(sketch.get_upper_bound(1) == 1.0);
245
163
  }
246
164
 
247
- TEST_CASE("theta sketch: deserialize compact estimation from java as base", "[theta_sketch]") {
248
- std::ifstream is;
249
- is.exceptions(std::ios::failbit | std::ios::badbit);
250
- is.open(inputPath + "theta_compact_estimation_from_java.sk", std::ios::binary);
251
- auto sketchptr = theta_sketch::deserialize(is);
252
- REQUIRE_FALSE(sketchptr->is_empty());
253
- REQUIRE(sketchptr->is_estimation_mode());
254
- REQUIRE(sketchptr->is_ordered());
255
- REQUIRE(sketchptr->get_num_retained() == 4342);
256
- REQUIRE(sketchptr->get_theta() == Approx(0.531700444213199).margin(1e-10));
257
- REQUIRE(sketchptr->get_estimate() == Approx(8166.25234614053).margin(1e-10));
258
- REQUIRE(sketchptr->get_lower_bound(2) == Approx(7996.956955317471).margin(1e-10));
259
- REQUIRE(sketchptr->get_upper_bound(2) == Approx(8339.090301078124).margin(1e-10));
260
-
261
- // the same construction process in Java must have produced exactly the same sketch
262
- update_theta_sketch update_sketch = update_theta_sketch::builder().build();
263
- const int n = 8192;
264
- for (int i = 0; i < n; i++) update_sketch.update(i);
265
- REQUIRE(sketchptr->get_num_retained() == update_sketch.get_num_retained());
266
- REQUIRE(sketchptr->get_theta() == Approx(update_sketch.get_theta()).margin(1e-10));
267
- REQUIRE(sketchptr->get_estimate() == Approx(update_sketch.get_estimate()).margin(1e-10));
268
- REQUIRE(sketchptr->get_lower_bound(1) == Approx(update_sketch.get_lower_bound(1)).margin(1e-10));
269
- REQUIRE(sketchptr->get_upper_bound(1) == Approx(update_sketch.get_upper_bound(1)).margin(1e-10));
270
- REQUIRE(sketchptr->get_lower_bound(2) == Approx(update_sketch.get_lower_bound(2)).margin(1e-10));
271
- REQUIRE(sketchptr->get_upper_bound(2) == Approx(update_sketch.get_upper_bound(2)).margin(1e-10));
272
- REQUIRE(sketchptr->get_lower_bound(3) == Approx(update_sketch.get_lower_bound(3)).margin(1e-10));
273
- REQUIRE(sketchptr->get_upper_bound(3) == Approx(update_sketch.get_upper_bound(3)).margin(1e-10));
274
- compact_theta_sketch compact_sketch = update_sketch.compact();
275
- // the sketches are ordered, so the iteration sequence must match exactly
276
- auto iter = sketchptr->begin();
277
- for (auto key: compact_sketch) {
278
- REQUIRE(*iter == key);
279
- ++iter;
280
- }
281
- }
282
-
283
- TEST_CASE("theta sketch: deserialize compact estimation from java as subclass", "[theta_sketch]") {
165
+ TEST_CASE("theta sketch: deserialize compact estimation from java", "[theta_sketch]") {
284
166
  std::ifstream is;
285
167
  is.exceptions(std::ios::failbit | std::ios::badbit);
286
168
  is.open(inputPath + "theta_compact_estimation_from_java.sk", std::ios::binary);
287
169
  auto sketch = compact_theta_sketch::deserialize(is);
288
170
  REQUIRE_FALSE(sketch.is_empty());
289
171
  REQUIRE(sketch.is_estimation_mode());
172
+ REQUIRE(sketch.is_ordered());
290
173
  REQUIRE(sketch.get_num_retained() == 4342);
291
174
  REQUIRE(sketch.get_theta() == Approx(0.531700444213199).margin(1e-10));
292
175
  REQUIRE(sketch.get_estimate() == Approx(8166.25234614053).margin(1e-10));
293
176
  REQUIRE(sketch.get_lower_bound(2) == Approx(7996.956955317471).margin(1e-10));
294
177
  REQUIRE(sketch.get_upper_bound(2) == Approx(8339.090301078124).margin(1e-10));
295
178
 
179
+ // the same construction process in Java must have produced exactly the same sketch
296
180
  update_theta_sketch update_sketch = update_theta_sketch::builder().build();
297
181
  const int n = 8192;
298
182
  for (int i = 0; i < n; i++) update_sketch.update(i);
@@ -305,132 +189,51 @@ TEST_CASE("theta sketch: deserialize compact estimation from java as subclass",
305
189
  REQUIRE(sketch.get_upper_bound(2) == Approx(update_sketch.get_upper_bound(2)).margin(1e-10));
306
190
  REQUIRE(sketch.get_lower_bound(3) == Approx(update_sketch.get_lower_bound(3)).margin(1e-10));
307
191
  REQUIRE(sketch.get_upper_bound(3) == Approx(update_sketch.get_upper_bound(3)).margin(1e-10));
192
+ compact_theta_sketch compact_sketch = update_sketch.compact();
193
+ // the sketches are ordered, so the iteration sequence must match exactly
194
+ auto iter = sketch.begin();
195
+ for (const auto& key: compact_sketch) {
196
+ REQUIRE(*iter == key);
197
+ ++iter;
198
+ }
308
199
  }
309
200
 
310
- TEST_CASE("theta sketch: serialize deserialize stream and bytes equivalency", "[theta_sketch]") {
201
+ TEST_CASE("theta sketch: serialize deserialize stream and bytes equivalence", "[theta_sketch]") {
311
202
  update_theta_sketch update_sketch = update_theta_sketch::builder().build();
312
203
  const int n = 8192;
313
204
  for (int i = 0; i < n; i++) update_sketch.update(i);
314
205
 
315
- // update sketch stream and bytes comparison
316
- {
317
- std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
318
- update_sketch.serialize(s);
319
- auto bytes = update_sketch.serialize();
320
- REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
321
- for (size_t i = 0; i < bytes.size(); ++i) {
322
- REQUIRE(((char*)bytes.data())[i] == (char)s.get());
323
- }
324
-
325
- // deserialize as base class
326
- {
327
- s.seekg(0); // rewind
328
- auto deserialized_sketch_ptr1 = theta_sketch::deserialize(s);
329
- auto deserialized_sketch_ptr2 = theta_sketch::deserialize(bytes.data(), bytes.size());
330
- REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
331
- REQUIRE(deserialized_sketch_ptr2->is_empty() == deserialized_sketch_ptr1->is_empty());
332
- REQUIRE(deserialized_sketch_ptr2->is_ordered() == deserialized_sketch_ptr1->is_ordered());
333
- REQUIRE(deserialized_sketch_ptr2->get_num_retained() == deserialized_sketch_ptr1->get_num_retained());
334
- REQUIRE(deserialized_sketch_ptr2->get_theta() == deserialized_sketch_ptr1->get_theta());
335
- REQUIRE(deserialized_sketch_ptr2->get_estimate() == deserialized_sketch_ptr1->get_estimate());
336
- REQUIRE(deserialized_sketch_ptr2->get_lower_bound(1) == deserialized_sketch_ptr1->get_lower_bound(1));
337
- REQUIRE(deserialized_sketch_ptr2->get_upper_bound(1) == deserialized_sketch_ptr1->get_upper_bound(1));
338
- // hash tables must be identical since they are restored from dumps, and iteration is deterministic
339
- auto iter = deserialized_sketch_ptr1->begin();
340
- for (auto key: *deserialized_sketch_ptr2) {
341
- REQUIRE(*iter == key);
342
- ++iter;
343
- }
344
- }
345
-
346
- // deserialize as subclass
347
- {
348
- s.seekg(0); // rewind
349
- update_theta_sketch deserialized_sketch1 = update_theta_sketch::deserialize(s);
350
- update_theta_sketch deserialized_sketch2 = update_theta_sketch::deserialize(bytes.data(), bytes.size());
351
- REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
352
- REQUIRE(deserialized_sketch2.is_empty() == deserialized_sketch1.is_empty());
353
- REQUIRE(deserialized_sketch2.is_ordered() == deserialized_sketch1.is_ordered());
354
- REQUIRE(deserialized_sketch2.get_num_retained() == deserialized_sketch1.get_num_retained());
355
- REQUIRE(deserialized_sketch2.get_theta() == deserialized_sketch1.get_theta());
356
- REQUIRE(deserialized_sketch2.get_estimate() == deserialized_sketch1.get_estimate());
357
- REQUIRE(deserialized_sketch2.get_lower_bound(1) == deserialized_sketch1.get_lower_bound(1));
358
- REQUIRE(deserialized_sketch2.get_upper_bound(1) == deserialized_sketch1.get_upper_bound(1));
359
- // hash tables must be identical since they are restored from dumps, and iteration is deterministic
360
- auto iter = deserialized_sketch1.begin();
361
- for (auto key: deserialized_sketch2) {
362
- REQUIRE(*iter == key);
363
- ++iter;
364
- }
365
- }
206
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
207
+ update_sketch.compact().serialize(s);
208
+ auto bytes = update_sketch.compact().serialize();
209
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
210
+ for (size_t i = 0; i < bytes.size(); ++i) {
211
+ REQUIRE(((char*)bytes.data())[i] == (char)s.get());
366
212
  }
367
213
 
368
- // compact sketch stream and bytes comparison
369
- {
370
- std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
371
- update_sketch.compact().serialize(s);
372
- auto bytes = update_sketch.compact().serialize();
373
- REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
374
- for (size_t i = 0; i < bytes.size(); ++i) {
375
- REQUIRE(((char*)bytes.data())[i] == (char)s.get());
376
- }
377
-
378
- // deserialize as base class
379
- {
380
- s.seekg(0); // rewind
381
- auto deserialized_sketch_ptr1 = theta_sketch::deserialize(s);
382
- auto deserialized_sketch_ptr2 = theta_sketch::deserialize(bytes.data(), bytes.size());
383
- REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
384
- REQUIRE(deserialized_sketch_ptr2->is_empty() == deserialized_sketch_ptr1->is_empty());
385
- REQUIRE(deserialized_sketch_ptr2->is_ordered() == deserialized_sketch_ptr1->is_ordered());
386
- REQUIRE(deserialized_sketch_ptr2->get_num_retained() == deserialized_sketch_ptr1->get_num_retained());
387
- REQUIRE(deserialized_sketch_ptr2->get_theta() == deserialized_sketch_ptr1->get_theta());
388
- REQUIRE(deserialized_sketch_ptr2->get_estimate() == deserialized_sketch_ptr1->get_estimate());
389
- REQUIRE(deserialized_sketch_ptr2->get_lower_bound(1) == deserialized_sketch_ptr1->get_lower_bound(1));
390
- REQUIRE(deserialized_sketch_ptr2->get_upper_bound(1) == deserialized_sketch_ptr1->get_upper_bound(1));
391
- // the sketches are ordered, so the iteration sequence must match exactly
392
- auto iter = deserialized_sketch_ptr1->begin();
393
- for (auto key: *deserialized_sketch_ptr2) {
394
- REQUIRE(*iter == key);
395
- ++iter;
396
- }
397
- }
398
-
399
- // deserialize as subclass
400
- {
401
- s.seekg(0); // rewind
402
- compact_theta_sketch deserialized_sketch1 = compact_theta_sketch::deserialize(s);
403
- compact_theta_sketch deserialized_sketch2 = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
404
- REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
405
- REQUIRE(deserialized_sketch2.is_empty() == deserialized_sketch1.is_empty());
406
- REQUIRE(deserialized_sketch2.is_ordered() == deserialized_sketch1.is_ordered());
407
- REQUIRE(deserialized_sketch2.get_num_retained() == deserialized_sketch1.get_num_retained());
408
- REQUIRE(deserialized_sketch2.get_theta() == deserialized_sketch1.get_theta());
409
- REQUIRE(deserialized_sketch2.get_estimate() == deserialized_sketch1.get_estimate());
410
- REQUIRE(deserialized_sketch2.get_lower_bound(1) == deserialized_sketch1.get_lower_bound(1));
411
- REQUIRE(deserialized_sketch2.get_upper_bound(1) == deserialized_sketch1.get_upper_bound(1));
412
- // the sketches are ordered, so the iteration sequence must match exactly
413
- auto iter = deserialized_sketch1.begin();
414
- for (auto key: deserialized_sketch2) {
415
- REQUIRE(*iter == key);
416
- ++iter;
417
- }
418
- }
214
+ s.seekg(0); // rewind
215
+ compact_theta_sketch deserialized_sketch1 = compact_theta_sketch::deserialize(s);
216
+ compact_theta_sketch deserialized_sketch2 = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
217
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
218
+ REQUIRE(deserialized_sketch2.is_empty() == deserialized_sketch1.is_empty());
219
+ REQUIRE(deserialized_sketch2.is_ordered() == deserialized_sketch1.is_ordered());
220
+ REQUIRE(deserialized_sketch2.get_num_retained() == deserialized_sketch1.get_num_retained());
221
+ REQUIRE(deserialized_sketch2.get_theta() == deserialized_sketch1.get_theta());
222
+ REQUIRE(deserialized_sketch2.get_estimate() == deserialized_sketch1.get_estimate());
223
+ REQUIRE(deserialized_sketch2.get_lower_bound(1) == deserialized_sketch1.get_lower_bound(1));
224
+ REQUIRE(deserialized_sketch2.get_upper_bound(1) == deserialized_sketch1.get_upper_bound(1));
225
+ // the sketches are ordered, so the iteration sequence must match exactly
226
+ auto iter = deserialized_sketch1.begin();
227
+ for (auto key: deserialized_sketch2) {
228
+ REQUIRE(*iter == key);
229
+ ++iter;
419
230
  }
420
231
  }
421
232
 
422
- TEST_CASE("theta sketch: deserialize update single item buffer overrun", "[theta_sketch]") {
423
- update_theta_sketch update_sketch = update_theta_sketch::builder().build();
424
- update_sketch.update(1);
425
- theta_sketch::vector_bytes bytes = update_sketch.serialize();
426
- REQUIRE_THROWS_AS(update_theta_sketch::deserialize(bytes.data(), 7), std::out_of_range);
427
- REQUIRE_THROWS_AS(update_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
428
- }
429
-
430
233
  TEST_CASE("theta sketch: deserialize compact single item buffer overrun", "[theta_sketch]") {
431
234
  update_theta_sketch update_sketch = update_theta_sketch::builder().build();
432
235
  update_sketch.update(1);
433
- theta_sketch::vector_bytes bytes = update_sketch.compact().serialize();
236
+ auto bytes = update_sketch.compact().serialize();
434
237
  REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 7), std::out_of_range);
435
238
  REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
436
239
  }