datasketches 0.1.2 → 0.2.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (160)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +12 -13
  4. data/ext/datasketches/ext.cpp +1 -1
  5. data/ext/datasketches/ext.h +4 -0
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/fi_wrapper.cpp +6 -8
  8. data/ext/datasketches/hll_wrapper.cpp +13 -14
  9. data/ext/datasketches/kll_wrapper.cpp +28 -76
  10. data/ext/datasketches/theta_wrapper.cpp +27 -41
  11. data/ext/datasketches/vo_wrapper.cpp +4 -6
  12. data/lib/datasketches/version.rb +1 -1
  13. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  14. data/vendor/datasketches-cpp/README.md +4 -4
  15. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
  16. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
  17. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
  18. data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
  19. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
  20. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
  21. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
  22. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
  23. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
  24. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
  25. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
  26. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
  27. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
  28. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
  29. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
  30. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
  31. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
  32. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
  33. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
  34. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
  35. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
  36. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
  37. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
  38. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  39. data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
  40. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
  41. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
  43. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
  44. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
  45. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
  46. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
  47. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
  48. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
  49. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
  50. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
  51. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
  52. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
  53. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
  54. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
  55. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
  56. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
  57. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
  58. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
  59. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
  60. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
  62. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
  63. data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
  64. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
  65. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
  66. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
  67. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
  68. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
  70. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
  71. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
  72. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
  74. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
  75. data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
  76. data/vendor/datasketches-cpp/python/README.md +6 -3
  77. data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
  78. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
  79. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
  80. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
  81. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
  82. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
  83. data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
  84. data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
  85. data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
  86. data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
  87. data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
  88. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
  89. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
  90. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
  91. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
  92. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
  93. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
  94. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
  95. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  96. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  97. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  98. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  99. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  100. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
  101. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
  102. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
  103. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
  104. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
  105. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
  106. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
  107. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
  108. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
  109. data/vendor/datasketches-cpp/setup.py +5 -3
  110. data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
  111. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
  112. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
  114. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
  115. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
  116. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
  117. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
  119. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
  120. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
  121. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
  122. data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
  123. data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
  124. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
  125. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
  126. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
  127. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
  128. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
  129. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
  130. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
  131. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
  132. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
  133. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
  134. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  135. data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
  136. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
  137. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
  138. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
  139. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
  140. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
  141. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
  142. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
  143. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
  144. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
  145. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
  146. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
  147. metadata +43 -34
  148. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
  149. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
  150. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
  151. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
  152. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
  153. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
  154. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
  155. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
  156. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  157. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
  160. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -20,103 +20,70 @@
20
20
  #ifndef THETA_UNION_HPP_
21
21
  #define THETA_UNION_HPP_
22
22
 
23
- #include <memory>
24
- #include <functional>
25
- #include <climits>
26
-
23
+ #include "serde.hpp"
27
24
  #include "theta_sketch.hpp"
25
+ #include "theta_union_base.hpp"
28
26
 
29
27
  namespace datasketches {
30
28
 
31
- /*
32
- * author Alexander Saydakov
33
- * author Lee Rhodes
34
- * author Kevin Lang
35
- */
36
-
37
- template<typename A>
29
+ template<typename Allocator = std::allocator<uint64_t>>
38
30
  class theta_union_alloc {
39
31
  public:
40
- class builder;
32
+ using Entry = uint64_t;
33
+ using ExtractKey = trivial_extract_key;
34
+ using Sketch = theta_sketch_alloc<Allocator>;
35
+ using CompactSketch = compact_theta_sketch_alloc<Allocator>;
36
+ using resize_factor = theta_constants::resize_factor;
37
+
38
+ struct pass_through_policy {
39
+ uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
40
+ unused(incoming_entry);
41
+ return internal_entry;
42
+ }
43
+ };
44
+ using State = theta_union_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
41
45
 
42
46
  // No constructor here. Use builder instead.
47
+ class builder;
43
48
 
44
49
  /**
45
50
  * This method is to update the union with a given sketch
46
51
  * @param sketch to update the union with
47
52
  */
48
- void update(const theta_sketch_alloc<A>& sketch);
53
+ template<typename FwdSketch>
54
+ void update(FwdSketch&& sketch);
49
55
 
50
56
  /**
51
57
  * This method produces a copy of the current state of the union as a compact sketch.
52
58
  * @param ordered optional flag to specify if ordered sketch should be produced
53
59
  * @return the result of the union
54
60
  */
55
- compact_theta_sketch_alloc<A> get_result(bool ordered = true) const;
61
+ CompactSketch get_result(bool ordered = true) const;
56
62
 
57
63
  private:
58
- bool is_empty_;
59
- uint64_t theta_;
60
- update_theta_sketch_alloc<A> state_;
64
+ State state_;
61
65
 
62
66
  // for builder
63
- theta_union_alloc(uint64_t theta, update_theta_sketch_alloc<A>&& state);
67
+ theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Allocator& allocator);
64
68
  };
65
69
 
66
- // builder
67
-
68
70
  template<typename A>
69
- class theta_union_alloc<A>::builder {
71
+ class theta_union_alloc<A>::builder: public theta_base_builder<builder, A> {
70
72
  public:
71
- typedef typename update_theta_sketch_alloc<A>::resize_factor resize_factor;
72
-
73
- /**
74
- * Set log2(k), where k is a nominal number of entries in the sketch
75
- * @param lg_k base 2 logarithm of nominal number of entries
76
- * @return this builder
77
- */
78
- builder& set_lg_k(uint8_t lg_k);
79
-
80
- /**
81
- * Set resize factor for the internal hash table (defaults to 8)
82
- * @param rf resize factor
83
- * @return this builder
84
- */
85
- builder& set_resize_factor(resize_factor rf);
86
-
87
- /**
88
- * Set sampling probability (initial theta). The default is 1, so the sketch retains
89
- * all entries until it reaches the limit, at which point it goes into the estimation mode
90
- * and reduces the effective sampling probability (theta) as necessary.
91
- * @param p sampling probability
92
- * @return this builder
93
- */
94
- builder& set_p(float p);
95
-
96
- /**
97
- * Set the seed for the hash function. Should be used carefully if needed.
98
- * Sketches produced with different seed are not compatible
99
- * and cannot be mixed in set operations.
100
- * @param seed hash seed
101
- * @return this builder
102
- */
103
- builder& set_seed(uint64_t seed);
73
+ builder(const A& allocator = A());
104
74
 
105
75
  /**
106
76
  * This is to create an instance of the union with predefined parameters.
107
- * @return and instance of the union
77
+ * @return an instance of the union
108
78
  */
109
79
  theta_union_alloc<A> build() const;
110
-
111
- private:
112
- typename update_theta_sketch_alloc<A>::builder sketch_builder;
113
80
  };
114
81
 
115
82
  // alias with default allocator for convenience
116
- typedef theta_union_alloc<std::allocator<void>> theta_union;
83
+ using theta_union = theta_union_alloc<std::allocator<uint64_t>>;
117
84
 
118
85
  } /* namespace datasketches */
119
86
 
120
87
  #include "theta_union_impl.hpp"
121
88
 
122
- # endif
89
+ #endif
@@ -30,7 +30,7 @@ template<
30
30
  typename Policy,
31
31
  typename Sketch,
32
32
  typename CompactSketch,
33
- typename Allocator = std::allocator<Entry>
33
+ typename Allocator
34
34
  >
35
35
  class theta_union_base {
36
36
  public:
@@ -17,6 +17,9 @@
17
17
  * under the License.
18
18
  */
19
19
 
20
+ #ifndef THETA_UNION_BASE_IMPL_HPP_
21
+ #define THETA_UNION_BASE_IMPL_HPP_
22
+
20
23
  #include <algorithm>
21
24
 
22
25
  #include "conditional_forward.hpp"
@@ -82,3 +85,5 @@ const P& theta_union_base<EN, EK, P, S, CS, A>::get_policy() const {
82
85
  }
83
86
 
84
87
  } /* namespace datasketches */
88
+
89
+ #endif
@@ -22,86 +22,30 @@
22
22
 
23
23
  namespace datasketches {
24
24
 
25
- /*
26
- * author Alexander Saydakov
27
- * author Lee Rhodes
28
- * author Kevin Lang
29
- */
30
-
31
- template<typename A>
32
- theta_union_alloc<A>::theta_union_alloc(uint64_t theta, update_theta_sketch_alloc<A>&& state):
33
- is_empty_(true), theta_(theta), state_(std::move(state)) {}
34
-
35
- template<typename A>
36
- void theta_union_alloc<A>::update(const theta_sketch_alloc<A>& sketch) {
37
- if (sketch.is_empty()) return;
38
- if (sketch.get_seed_hash() != state_.get_seed_hash()) throw std::invalid_argument("seed hash mismatch");
39
- is_empty_ = false;
40
- if (sketch.get_theta64() < theta_) theta_ = sketch.get_theta64();
41
- if (sketch.is_ordered()) {
42
- for (auto hash: sketch) {
43
- if (hash >= theta_) break; // early stop
44
- state_.internal_update(hash);
45
- }
46
- } else {
47
- for (auto hash: sketch) if (hash < theta_) state_.internal_update(hash);
48
- }
49
- if (state_.get_theta64() < theta_) theta_ = state_.get_theta64();
50
- }
51
-
52
25
  template<typename A>
53
- compact_theta_sketch_alloc<A> theta_union_alloc<A>::get_result(bool ordered) const {
54
- if (is_empty_) return state_.compact(ordered);
55
- const uint32_t nom_num_keys = 1 << state_.lg_nom_size_;
56
- if (theta_ >= state_.theta_ && state_.get_num_retained() <= nom_num_keys) return state_.compact(ordered);
57
- uint64_t theta = std::min(theta_, state_.get_theta64());
58
- vector_u64<A> keys(state_.get_num_retained());
59
- uint32_t num_keys = 0;
60
- for (auto key: state_) {
61
- if (key < theta) keys[num_keys++] = key;
62
- }
63
- if (num_keys > nom_num_keys) {
64
- std::nth_element(keys.begin(), keys.begin() + nom_num_keys, keys.begin() + num_keys);
65
- theta = keys[nom_num_keys];
66
- num_keys = nom_num_keys;
67
- }
68
- if (num_keys != state_.get_num_retained()) {
69
- keys.resize(num_keys);
70
- }
71
- if (ordered) std::sort(keys.begin(), keys.end());
72
- return compact_theta_sketch_alloc<A>(false, theta, std::move(keys), state_.get_seed_hash(), ordered);
73
- }
74
-
75
- // builder
26
+ theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator):
27
+ state_(lg_cur_size, lg_nom_size, rf, theta, seed, pass_through_policy(), allocator)
28
+ {}
76
29
 
77
30
  template<typename A>
78
- typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_lg_k(uint8_t lg_k) {
79
- sketch_builder.set_lg_k(lg_k);
80
- return *this;
31
+ template<typename SS>
32
+ void theta_union_alloc<A>::update(SS&& sketch) {
33
+ state_.update(std::forward<SS>(sketch));
81
34
  }
82
35
 
83
36
  template<typename A>
84
- typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_resize_factor(resize_factor rf) {
85
- sketch_builder.set_resize_factor(rf);
86
- return *this;
37
+ auto theta_union_alloc<A>::get_result(bool ordered) const -> CompactSketch {
38
+ return state_.get_result(ordered);
87
39
  }
88
40
 
89
41
  template<typename A>
90
- typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_p(float p) {
91
- sketch_builder.set_p(p);
92
- return *this;
93
- }
94
-
95
- template<typename A>
96
- typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_seed(uint64_t seed) {
97
- sketch_builder.set_seed(seed);
98
- return *this;
99
- }
42
+ theta_union_alloc<A>::builder::builder(const A& allocator): theta_base_builder<builder, A>(allocator) {}
100
43
 
101
44
  template<typename A>
102
- theta_union_alloc<A> theta_union_alloc<A>::builder::build() const {
103
- update_theta_sketch_alloc<A> sketch = sketch_builder.build();
104
- return theta_union_alloc(sketch.get_theta64(), std::move(sketch));
45
+ auto theta_union_alloc<A>::builder::build() const -> theta_union_alloc {
46
+ return theta_union_alloc(
47
+ this->starting_sub_multiple(this->lg_k_ + 1, this->MIN_LG_K, static_cast<uint8_t>(this->rf_)),
48
+ this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
105
49
  }
106
50
 
107
51
  } /* namespace datasketches */
@@ -34,7 +34,7 @@ namespace datasketches {
34
34
  template<
35
35
  typename Entry,
36
36
  typename ExtractKey,
37
- typename Allocator = std::allocator<Entry>
37
+ typename Allocator
38
38
  >
39
39
  struct theta_update_sketch_base {
40
40
  using resize_factor = theta_constants::resize_factor;
@@ -147,7 +147,7 @@ protected:
147
147
  static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
148
148
  };
149
149
 
150
- // key extractors
150
+ // key extractor
151
151
 
152
152
  struct trivial_extract_key {
153
153
  template<typename T>
@@ -156,17 +156,7 @@ struct trivial_extract_key {
156
156
  }
157
157
  };
158
158
 
159
- template<typename K, typename V>
160
- struct pair_extract_key {
161
- K& operator()(std::pair<K, V>& entry) const {
162
- return entry.first;
163
- }
164
- const K& operator()(const std::pair<K, V>& entry) const {
165
- return entry.first;
166
- }
167
- };
168
-
169
- // not zero
159
+ // key not zero
170
160
 
171
161
  template<typename Entry, typename ExtractKey>
172
162
  class key_not_zero {
@@ -195,12 +185,6 @@ static inline uint64_t compute_hash(const void* data, size_t length, uint64_t se
195
185
  return (hashes.h1 >> 1); // Java implementation does unsigned shift >>> to make values positive
196
186
  }
197
187
 
198
- static inline uint16_t compute_seed_hash(uint64_t seed) {
199
- HashState hashes;
200
- MurmurHash3_x64_128(&seed, sizeof(seed), 0, hashes);
201
- return hashes.h1;
202
- }
203
-
204
188
  // iterators
205
189
 
206
190
  template<typename Entry, typename ExtractKey>
@@ -17,6 +17,9 @@
17
17
  * under the License.
18
18
  */
19
19
 
20
+ #ifndef THETA_UPDATE_SKETCH_BASE_IMPL_HPP_
21
+ #define THETA_UPDATE_SKETCH_BASE_IMPL_HPP_
22
+
20
23
  #include <iostream>
21
24
  #include <sstream>
22
25
  #include <algorithm>
@@ -69,7 +72,7 @@ entries_(nullptr)
69
72
 
70
73
  template<typename EN, typename EK, typename A>
71
74
  theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(theta_update_sketch_base&& other) noexcept:
72
- allocator_(other.allocator_),
75
+ allocator_(std::move(other.allocator_)),
73
76
  is_empty_(other.is_empty_),
74
77
  lg_cur_size_(other.lg_cur_size_),
75
78
  lg_nom_size_(other.lg_nom_size_),
@@ -387,3 +390,5 @@ auto theta_const_iterator<Entry, ExtractKey>::operator*() const -> const Entry&
387
390
  }
388
391
 
389
392
  } /* namespace datasketches */
393
+
394
+ #endif
@@ -42,4 +42,5 @@ target_sources(theta_test
42
42
  theta_union_test.cpp
43
43
  theta_intersection_test.cpp
44
44
  theta_a_not_b_test.cpp
45
+ theta_jaccard_similarity_test.cpp
45
46
  )
@@ -20,11 +20,10 @@
20
20
  #include <iostream>
21
21
 
22
22
  #include <catch.hpp>
23
- #include <jaccard_similarity.hpp>
24
23
 
25
- namespace datasketches {
24
+ #include "theta_jaccard_similarity.hpp"
26
25
 
27
- using update_theta_sketch = update_theta_sketch_experimental<>;
26
+ namespace datasketches {
28
27
 
29
28
  TEST_CASE("theta jaccard: empty", "[theta_sketch]") {
30
29
  auto sk_a = update_theta_sketch::builder().build();
@@ -17,10 +17,10 @@
17
17
  * under the License.
18
18
  */
19
19
 
20
- #include <catch.hpp>
21
20
  #include <fstream>
22
21
  #include <sstream>
23
22
 
23
+ #include <catch.hpp>
24
24
  #include <theta_sketch.hpp>
25
25
 
26
26
  namespace datasketches {
@@ -134,75 +134,7 @@ TEST_CASE("theta sketch: estimation", "[theta_sketch]") {
134
134
  REQUIRE(compact_sketch.get_upper_bound(1) > n);
135
135
  }
136
136
 
137
- TEST_CASE("theta sketch: deserialize update empty from java as base", "[theta_sketch]") {
138
- std::ifstream is;
139
- is.exceptions(std::ios::failbit | std::ios::badbit);
140
- is.open(inputPath + "theta_update_empty_from_java.sk", std::ios::binary);
141
- auto sketchptr = theta_sketch::deserialize(is);
142
- REQUIRE(sketchptr->is_empty());
143
- REQUIRE_FALSE(sketchptr->is_estimation_mode());
144
- REQUIRE(sketchptr->get_num_retained() == 0);
145
- REQUIRE(sketchptr->get_theta() == 1.0);
146
- REQUIRE(sketchptr->get_estimate() == 0.0);
147
- REQUIRE(sketchptr->get_lower_bound(1) == 0.0);
148
- REQUIRE(sketchptr->get_upper_bound(1) == 0.0);
149
- }
150
-
151
- TEST_CASE("theta sketch: deserialize update empty from java as subclass", "[theta_sketch]") {
152
- std::ifstream is;
153
- is.exceptions(std::ios::failbit | std::ios::badbit);
154
- is.open(inputPath + "theta_update_empty_from_java.sk", std::ios::binary);
155
- auto sketch = update_theta_sketch::deserialize(is);
156
- REQUIRE(sketch.is_empty());
157
- REQUIRE_FALSE(sketch.is_estimation_mode());
158
- REQUIRE(sketch.get_num_retained() == 0);
159
- REQUIRE(sketch.get_theta() == 1.0);
160
- REQUIRE(sketch.get_estimate() == 0.0);
161
- REQUIRE(sketch.get_lower_bound(1) == 0.0);
162
- REQUIRE(sketch.get_upper_bound(1) == 0.0);
163
- }
164
-
165
- TEST_CASE("theta sketch: deserialize update estimation from java as base", "[theta_sketch]") {
166
- std::ifstream is;
167
- is.exceptions(std::ios::failbit | std::ios::badbit);
168
- is.open(inputPath + "theta_update_estimation_from_java.sk", std::ios::binary);
169
- auto sketchptr = theta_sketch::deserialize(is);
170
- REQUIRE_FALSE(sketchptr->is_empty());
171
- REQUIRE(sketchptr->is_estimation_mode());
172
- REQUIRE(sketchptr->get_num_retained() == 5324);
173
- REQUIRE(sketchptr->get_estimate() == Approx(10000.0).margin(10000 * 0.01));
174
- REQUIRE(sketchptr->get_lower_bound(1) < 10000);
175
- REQUIRE(sketchptr->get_upper_bound(1) > 10000);
176
- }
177
-
178
- TEST_CASE("theta sketch: deserialize update estimation from java as subclass", "[theta_sketch]") {
179
- std::ifstream is;
180
- is.exceptions(std::ios::failbit | std::ios::badbit);
181
- is.open(inputPath + "theta_update_estimation_from_java.sk", std::ios::binary);
182
- auto sketch = update_theta_sketch::deserialize(is);
183
- REQUIRE_FALSE(sketch.is_empty());
184
- REQUIRE(sketch.is_estimation_mode());
185
- REQUIRE(sketch.get_num_retained() == 5324);
186
- REQUIRE(sketch.get_estimate() == Approx(10000.0).margin(10000 * 0.01));
187
- REQUIRE(sketch.get_lower_bound(1) < 10000);
188
- REQUIRE(sketch.get_upper_bound(1) > 10000);
189
- }
190
-
191
- TEST_CASE("theta sketch: deserialize compact empty from java as base", "[theta_sketch]") {
192
- std::ifstream is;
193
- is.exceptions(std::ios::failbit | std::ios::badbit);
194
- is.open(inputPath + "theta_compact_empty_from_java.sk", std::ios::binary);
195
- auto sketchptr = theta_sketch::deserialize(is);
196
- REQUIRE(sketchptr->is_empty());
197
- REQUIRE_FALSE(sketchptr->is_estimation_mode());
198
- REQUIRE(sketchptr->get_num_retained() == 0);
199
- REQUIRE(sketchptr->get_theta() == 1.0);
200
- REQUIRE(sketchptr->get_estimate() == 0.0);
201
- REQUIRE(sketchptr->get_lower_bound(1) == 0.0);
202
- REQUIRE(sketchptr->get_upper_bound(1) == 0.0);
203
- }
204
-
205
- TEST_CASE("theta sketch: deserialize compact empty from java as subclass", "[theta_sketch]") {
137
+ TEST_CASE("theta sketch: deserialize compact empty from java", "[theta_sketch]") {
206
138
  std::ifstream is;
207
139
  is.exceptions(std::ios::failbit | std::ios::badbit);
208
140
  is.open(inputPath + "theta_compact_empty_from_java.sk", std::ios::binary);
@@ -216,21 +148,7 @@ TEST_CASE("theta sketch: deserialize compact empty from java as subclass", "[the
216
148
  REQUIRE(sketch.get_upper_bound(1) == 0.0);
217
149
  }
218
150
 
219
- TEST_CASE("theta sketch: deserialize single item from java as base", "[theta_sketch]") {
220
- std::ifstream is;
221
- is.exceptions(std::ios::failbit | std::ios::badbit);
222
- is.open(inputPath + "theta_compact_single_item_from_java.sk", std::ios::binary);
223
- auto sketchptr = theta_sketch::deserialize(is);
224
- REQUIRE_FALSE(sketchptr->is_empty());
225
- REQUIRE_FALSE(sketchptr->is_estimation_mode());
226
- REQUIRE(sketchptr->get_num_retained() == 1);
227
- REQUIRE(sketchptr->get_theta() == 1.0);
228
- REQUIRE(sketchptr->get_estimate() == 1.0);
229
- REQUIRE(sketchptr->get_lower_bound(1) == 1.0);
230
- REQUIRE(sketchptr->get_upper_bound(1) == 1.0);
231
- }
232
-
233
- TEST_CASE("theta sketch: deserialize single item from java as subclass", "[theta_sketch]") {
151
+ TEST_CASE("theta sketch: deserialize single item from java", "[theta_sketch]") {
234
152
  std::ifstream is;
235
153
  is.exceptions(std::ios::failbit | std::ios::badbit);
236
154
  is.open(inputPath + "theta_compact_single_item_from_java.sk", std::ios::binary);
@@ -244,55 +162,21 @@ TEST_CASE("theta sketch: deserialize single item from java as subclass", "[theta
244
162
  REQUIRE(sketch.get_upper_bound(1) == 1.0);
245
163
  }
246
164
 
247
- TEST_CASE("theta sketch: deserialize compact estimation from java as base", "[theta_sketch]") {
248
- std::ifstream is;
249
- is.exceptions(std::ios::failbit | std::ios::badbit);
250
- is.open(inputPath + "theta_compact_estimation_from_java.sk", std::ios::binary);
251
- auto sketchptr = theta_sketch::deserialize(is);
252
- REQUIRE_FALSE(sketchptr->is_empty());
253
- REQUIRE(sketchptr->is_estimation_mode());
254
- REQUIRE(sketchptr->is_ordered());
255
- REQUIRE(sketchptr->get_num_retained() == 4342);
256
- REQUIRE(sketchptr->get_theta() == Approx(0.531700444213199).margin(1e-10));
257
- REQUIRE(sketchptr->get_estimate() == Approx(8166.25234614053).margin(1e-10));
258
- REQUIRE(sketchptr->get_lower_bound(2) == Approx(7996.956955317471).margin(1e-10));
259
- REQUIRE(sketchptr->get_upper_bound(2) == Approx(8339.090301078124).margin(1e-10));
260
-
261
- // the same construction process in Java must have produced exactly the same sketch
262
- update_theta_sketch update_sketch = update_theta_sketch::builder().build();
263
- const int n = 8192;
264
- for (int i = 0; i < n; i++) update_sketch.update(i);
265
- REQUIRE(sketchptr->get_num_retained() == update_sketch.get_num_retained());
266
- REQUIRE(sketchptr->get_theta() == Approx(update_sketch.get_theta()).margin(1e-10));
267
- REQUIRE(sketchptr->get_estimate() == Approx(update_sketch.get_estimate()).margin(1e-10));
268
- REQUIRE(sketchptr->get_lower_bound(1) == Approx(update_sketch.get_lower_bound(1)).margin(1e-10));
269
- REQUIRE(sketchptr->get_upper_bound(1) == Approx(update_sketch.get_upper_bound(1)).margin(1e-10));
270
- REQUIRE(sketchptr->get_lower_bound(2) == Approx(update_sketch.get_lower_bound(2)).margin(1e-10));
271
- REQUIRE(sketchptr->get_upper_bound(2) == Approx(update_sketch.get_upper_bound(2)).margin(1e-10));
272
- REQUIRE(sketchptr->get_lower_bound(3) == Approx(update_sketch.get_lower_bound(3)).margin(1e-10));
273
- REQUIRE(sketchptr->get_upper_bound(3) == Approx(update_sketch.get_upper_bound(3)).margin(1e-10));
274
- compact_theta_sketch compact_sketch = update_sketch.compact();
275
- // the sketches are ordered, so the iteration sequence must match exactly
276
- auto iter = sketchptr->begin();
277
- for (auto key: compact_sketch) {
278
- REQUIRE(*iter == key);
279
- ++iter;
280
- }
281
- }
282
-
283
- TEST_CASE("theta sketch: deserialize compact estimation from java as subclass", "[theta_sketch]") {
165
+ TEST_CASE("theta sketch: deserialize compact estimation from java", "[theta_sketch]") {
284
166
  std::ifstream is;
285
167
  is.exceptions(std::ios::failbit | std::ios::badbit);
286
168
  is.open(inputPath + "theta_compact_estimation_from_java.sk", std::ios::binary);
287
169
  auto sketch = compact_theta_sketch::deserialize(is);
288
170
  REQUIRE_FALSE(sketch.is_empty());
289
171
  REQUIRE(sketch.is_estimation_mode());
172
+ REQUIRE(sketch.is_ordered());
290
173
  REQUIRE(sketch.get_num_retained() == 4342);
291
174
  REQUIRE(sketch.get_theta() == Approx(0.531700444213199).margin(1e-10));
292
175
  REQUIRE(sketch.get_estimate() == Approx(8166.25234614053).margin(1e-10));
293
176
  REQUIRE(sketch.get_lower_bound(2) == Approx(7996.956955317471).margin(1e-10));
294
177
  REQUIRE(sketch.get_upper_bound(2) == Approx(8339.090301078124).margin(1e-10));
295
178
 
179
+ // the same construction process in Java must have produced exactly the same sketch
296
180
  update_theta_sketch update_sketch = update_theta_sketch::builder().build();
297
181
  const int n = 8192;
298
182
  for (int i = 0; i < n; i++) update_sketch.update(i);
@@ -305,132 +189,51 @@ TEST_CASE("theta sketch: deserialize compact estimation from java as subclass",
305
189
  REQUIRE(sketch.get_upper_bound(2) == Approx(update_sketch.get_upper_bound(2)).margin(1e-10));
306
190
  REQUIRE(sketch.get_lower_bound(3) == Approx(update_sketch.get_lower_bound(3)).margin(1e-10));
307
191
  REQUIRE(sketch.get_upper_bound(3) == Approx(update_sketch.get_upper_bound(3)).margin(1e-10));
192
+ compact_theta_sketch compact_sketch = update_sketch.compact();
193
+ // the sketches are ordered, so the iteration sequence must match exactly
194
+ auto iter = sketch.begin();
195
+ for (const auto& key: compact_sketch) {
196
+ REQUIRE(*iter == key);
197
+ ++iter;
198
+ }
308
199
  }
309
200
 
310
- TEST_CASE("theta sketch: serialize deserialize stream and bytes equivalency", "[theta_sketch]") {
201
+ TEST_CASE("theta sketch: serialize deserialize stream and bytes equivalence", "[theta_sketch]") {
311
202
  update_theta_sketch update_sketch = update_theta_sketch::builder().build();
312
203
  const int n = 8192;
313
204
  for (int i = 0; i < n; i++) update_sketch.update(i);
314
205
 
315
- // update sketch stream and bytes comparison
316
- {
317
- std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
318
- update_sketch.serialize(s);
319
- auto bytes = update_sketch.serialize();
320
- REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
321
- for (size_t i = 0; i < bytes.size(); ++i) {
322
- REQUIRE(((char*)bytes.data())[i] == (char)s.get());
323
- }
324
-
325
- // deserialize as base class
326
- {
327
- s.seekg(0); // rewind
328
- auto deserialized_sketch_ptr1 = theta_sketch::deserialize(s);
329
- auto deserialized_sketch_ptr2 = theta_sketch::deserialize(bytes.data(), bytes.size());
330
- REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
331
- REQUIRE(deserialized_sketch_ptr2->is_empty() == deserialized_sketch_ptr1->is_empty());
332
- REQUIRE(deserialized_sketch_ptr2->is_ordered() == deserialized_sketch_ptr1->is_ordered());
333
- REQUIRE(deserialized_sketch_ptr2->get_num_retained() == deserialized_sketch_ptr1->get_num_retained());
334
- REQUIRE(deserialized_sketch_ptr2->get_theta() == deserialized_sketch_ptr1->get_theta());
335
- REQUIRE(deserialized_sketch_ptr2->get_estimate() == deserialized_sketch_ptr1->get_estimate());
336
- REQUIRE(deserialized_sketch_ptr2->get_lower_bound(1) == deserialized_sketch_ptr1->get_lower_bound(1));
337
- REQUIRE(deserialized_sketch_ptr2->get_upper_bound(1) == deserialized_sketch_ptr1->get_upper_bound(1));
338
- // hash tables must be identical since they are restored from dumps, and iteration is deterministic
339
- auto iter = deserialized_sketch_ptr1->begin();
340
- for (auto key: *deserialized_sketch_ptr2) {
341
- REQUIRE(*iter == key);
342
- ++iter;
343
- }
344
- }
345
-
346
- // deserialize as subclass
347
- {
348
- s.seekg(0); // rewind
349
- update_theta_sketch deserialized_sketch1 = update_theta_sketch::deserialize(s);
350
- update_theta_sketch deserialized_sketch2 = update_theta_sketch::deserialize(bytes.data(), bytes.size());
351
- REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
352
- REQUIRE(deserialized_sketch2.is_empty() == deserialized_sketch1.is_empty());
353
- REQUIRE(deserialized_sketch2.is_ordered() == deserialized_sketch1.is_ordered());
354
- REQUIRE(deserialized_sketch2.get_num_retained() == deserialized_sketch1.get_num_retained());
355
- REQUIRE(deserialized_sketch2.get_theta() == deserialized_sketch1.get_theta());
356
- REQUIRE(deserialized_sketch2.get_estimate() == deserialized_sketch1.get_estimate());
357
- REQUIRE(deserialized_sketch2.get_lower_bound(1) == deserialized_sketch1.get_lower_bound(1));
358
- REQUIRE(deserialized_sketch2.get_upper_bound(1) == deserialized_sketch1.get_upper_bound(1));
359
- // hash tables must be identical since they are restored from dumps, and iteration is deterministic
360
- auto iter = deserialized_sketch1.begin();
361
- for (auto key: deserialized_sketch2) {
362
- REQUIRE(*iter == key);
363
- ++iter;
364
- }
365
- }
206
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
207
+ update_sketch.compact().serialize(s);
208
+ auto bytes = update_sketch.compact().serialize();
209
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
210
+ for (size_t i = 0; i < bytes.size(); ++i) {
211
+ REQUIRE(((char*)bytes.data())[i] == (char)s.get());
366
212
  }
367
213
 
368
- // compact sketch stream and bytes comparison
369
- {
370
- std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
371
- update_sketch.compact().serialize(s);
372
- auto bytes = update_sketch.compact().serialize();
373
- REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
374
- for (size_t i = 0; i < bytes.size(); ++i) {
375
- REQUIRE(((char*)bytes.data())[i] == (char)s.get());
376
- }
377
-
378
- // deserialize as base class
379
- {
380
- s.seekg(0); // rewind
381
- auto deserialized_sketch_ptr1 = theta_sketch::deserialize(s);
382
- auto deserialized_sketch_ptr2 = theta_sketch::deserialize(bytes.data(), bytes.size());
383
- REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
384
- REQUIRE(deserialized_sketch_ptr2->is_empty() == deserialized_sketch_ptr1->is_empty());
385
- REQUIRE(deserialized_sketch_ptr2->is_ordered() == deserialized_sketch_ptr1->is_ordered());
386
- REQUIRE(deserialized_sketch_ptr2->get_num_retained() == deserialized_sketch_ptr1->get_num_retained());
387
- REQUIRE(deserialized_sketch_ptr2->get_theta() == deserialized_sketch_ptr1->get_theta());
388
- REQUIRE(deserialized_sketch_ptr2->get_estimate() == deserialized_sketch_ptr1->get_estimate());
389
- REQUIRE(deserialized_sketch_ptr2->get_lower_bound(1) == deserialized_sketch_ptr1->get_lower_bound(1));
390
- REQUIRE(deserialized_sketch_ptr2->get_upper_bound(1) == deserialized_sketch_ptr1->get_upper_bound(1));
391
- // the sketches are ordered, so the iteration sequence must match exactly
392
- auto iter = deserialized_sketch_ptr1->begin();
393
- for (auto key: *deserialized_sketch_ptr2) {
394
- REQUIRE(*iter == key);
395
- ++iter;
396
- }
397
- }
398
-
399
- // deserialize as subclass
400
- {
401
- s.seekg(0); // rewind
402
- compact_theta_sketch deserialized_sketch1 = compact_theta_sketch::deserialize(s);
403
- compact_theta_sketch deserialized_sketch2 = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
404
- REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
405
- REQUIRE(deserialized_sketch2.is_empty() == deserialized_sketch1.is_empty());
406
- REQUIRE(deserialized_sketch2.is_ordered() == deserialized_sketch1.is_ordered());
407
- REQUIRE(deserialized_sketch2.get_num_retained() == deserialized_sketch1.get_num_retained());
408
- REQUIRE(deserialized_sketch2.get_theta() == deserialized_sketch1.get_theta());
409
- REQUIRE(deserialized_sketch2.get_estimate() == deserialized_sketch1.get_estimate());
410
- REQUIRE(deserialized_sketch2.get_lower_bound(1) == deserialized_sketch1.get_lower_bound(1));
411
- REQUIRE(deserialized_sketch2.get_upper_bound(1) == deserialized_sketch1.get_upper_bound(1));
412
- // the sketches are ordered, so the iteration sequence must match exactly
413
- auto iter = deserialized_sketch1.begin();
414
- for (auto key: deserialized_sketch2) {
415
- REQUIRE(*iter == key);
416
- ++iter;
417
- }
418
- }
214
+ s.seekg(0); // rewind
215
+ compact_theta_sketch deserialized_sketch1 = compact_theta_sketch::deserialize(s);
216
+ compact_theta_sketch deserialized_sketch2 = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
217
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
218
+ REQUIRE(deserialized_sketch2.is_empty() == deserialized_sketch1.is_empty());
219
+ REQUIRE(deserialized_sketch2.is_ordered() == deserialized_sketch1.is_ordered());
220
+ REQUIRE(deserialized_sketch2.get_num_retained() == deserialized_sketch1.get_num_retained());
221
+ REQUIRE(deserialized_sketch2.get_theta() == deserialized_sketch1.get_theta());
222
+ REQUIRE(deserialized_sketch2.get_estimate() == deserialized_sketch1.get_estimate());
223
+ REQUIRE(deserialized_sketch2.get_lower_bound(1) == deserialized_sketch1.get_lower_bound(1));
224
+ REQUIRE(deserialized_sketch2.get_upper_bound(1) == deserialized_sketch1.get_upper_bound(1));
225
+ // the sketches are ordered, so the iteration sequence must match exactly
226
+ auto iter = deserialized_sketch1.begin();
227
+ for (auto key: deserialized_sketch2) {
228
+ REQUIRE(*iter == key);
229
+ ++iter;
419
230
  }
420
231
  }
421
232
 
422
- TEST_CASE("theta sketch: deserialize update single item buffer overrun", "[theta_sketch]") {
423
- update_theta_sketch update_sketch = update_theta_sketch::builder().build();
424
- update_sketch.update(1);
425
- theta_sketch::vector_bytes bytes = update_sketch.serialize();
426
- REQUIRE_THROWS_AS(update_theta_sketch::deserialize(bytes.data(), 7), std::out_of_range);
427
- REQUIRE_THROWS_AS(update_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
428
- }
429
-
430
233
  TEST_CASE("theta sketch: deserialize compact single item buffer overrun", "[theta_sketch]") {
431
234
  update_theta_sketch update_sketch = update_theta_sketch::builder().build();
432
235
  update_sketch.update(1);
433
- theta_sketch::vector_bytes bytes = update_sketch.compact().serialize();
236
+ auto bytes = update_sketch.compact().serialize();
434
237
  REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 7), std::out_of_range);
435
238
  REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
436
239
  }