datasketches 0.1.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (205)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +17 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/ext/datasketches/cpc_wrapper.cpp +12 -13
  6. data/ext/datasketches/ext.cpp +1 -1
  7. data/ext/datasketches/ext.h +4 -0
  8. data/ext/datasketches/extconf.rb +1 -1
  9. data/ext/datasketches/fi_wrapper.cpp +6 -8
  10. data/ext/datasketches/hll_wrapper.cpp +13 -14
  11. data/ext/datasketches/kll_wrapper.cpp +28 -76
  12. data/ext/datasketches/theta_wrapper.cpp +27 -41
  13. data/ext/datasketches/vo_wrapper.cpp +4 -6
  14. data/lib/datasketches/version.rb +1 -1
  15. data/vendor/datasketches-cpp/CMakeLists.txt +10 -0
  16. data/vendor/datasketches-cpp/LICENSE +40 -3
  17. data/vendor/datasketches-cpp/NOTICE +1 -1
  18. data/vendor/datasketches-cpp/README.md +4 -4
  19. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +18 -7
  20. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  21. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  24. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  25. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
  26. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  27. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
  28. data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
  29. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +13 -3
  31. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +20 -20
  32. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +116 -105
  33. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +22 -6
  34. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +140 -101
  35. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
  36. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +20 -20
  37. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -16
  38. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +6 -6
  39. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +10 -10
  40. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +21 -21
  41. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
  42. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  43. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
  44. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  45. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  46. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
  47. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +102 -105
  48. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
  49. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +141 -125
  50. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
  51. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +5 -5
  52. data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
  53. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +81 -109
  54. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +25 -24
  55. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  56. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +5 -5
  57. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +89 -105
  58. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +13 -13
  59. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +130 -165
  60. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +21 -22
  61. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  62. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
  63. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  64. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
  65. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +88 -83
  66. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  67. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +34 -45
  68. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +7 -8
  69. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +41 -52
  70. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +7 -8
  71. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +220 -251
  72. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +42 -42
  73. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +36 -38
  74. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  75. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +15 -14
  76. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +47 -44
  77. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +62 -87
  78. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +121 -128
  79. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
  80. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  81. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  82. data/vendor/datasketches-cpp/hll/include/hll.hpp +25 -53
  83. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +8 -8
  84. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +36 -36
  85. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +28 -28
  86. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  87. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +37 -37
  88. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +57 -61
  89. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  90. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  91. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  92. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  93. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  94. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  95. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +40 -25
  96. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +50 -6
  97. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +164 -136
  98. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  99. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  100. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  101. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
  102. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +178 -88
  103. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  104. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  105. data/vendor/datasketches-cpp/python/CMakeLists.txt +12 -6
  106. data/vendor/datasketches-cpp/python/README.md +52 -49
  107. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  108. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  109. data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
  110. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -6
  111. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +4 -2
  112. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
  113. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +38 -28
  114. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  115. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  116. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -2
  117. data/vendor/datasketches-cpp/python/tests/kll_test.py +5 -5
  118. data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
  119. data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
  120. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  121. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  122. data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
  123. data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +18 -8
  124. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
  125. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +488 -0
  126. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
  127. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
  128. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
  129. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
  130. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
  131. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  132. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  133. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  134. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  135. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  136. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
  137. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
  138. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +19 -13
  139. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +130 -127
  140. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
  141. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +41 -49
  142. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
  143. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
  144. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  145. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -44
  146. data/vendor/datasketches-cpp/setup.py +11 -6
  147. data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
  148. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +3 -2
  149. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
  150. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  151. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  152. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
  153. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
  154. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
  155. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +11 -4
  156. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
  157. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +26 -28
  158. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
  159. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
  160. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
  161. data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
  162. data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +24 -36
  163. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
  164. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
  165. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +163 -256
  166. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +250 -651
  167. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
  168. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
  169. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +6 -1
  170. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
  171. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +10 -21
  172. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +44 -30
  173. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  174. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  175. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  176. data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +60 -5
  177. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +74 -235
  178. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  179. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
  180. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  181. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
  182. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
  183. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +57 -70
  184. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
  185. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  186. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +18 -21
  187. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +13 -16
  188. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +7 -6
  189. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +3 -3
  190. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  191. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +13 -16
  192. metadata +51 -36
  193. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
  194. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
  195. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
  196. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
  197. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
  198. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
  199. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
  200. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
  201. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  202. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  203. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  204. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
  205. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -20,45 +20,29 @@
20
20
  #ifndef THETA_SKETCH_HPP_
21
21
  #define THETA_SKETCH_HPP_
22
22
 
23
- #include <memory>
24
- #include <functional>
25
- #include <climits>
26
- #include <vector>
27
-
28
- #include "common_defs.hpp"
23
+ #include "theta_update_sketch_base.hpp"
29
24
 
30
25
  namespace datasketches {
31
26
 
32
- /*
33
- * author Alexander Saydakov
34
- * author Lee Rhodes
35
- * author Kevin Lang
36
- */
37
-
38
- // forward-declarations
39
- template<typename A> class theta_sketch_alloc;
40
- template<typename A> class update_theta_sketch_alloc;
41
- template<typename A> class compact_theta_sketch_alloc;
42
- template<typename A> class theta_union_alloc;
43
- template<typename A> class theta_intersection_alloc;
44
- template<typename A> class theta_a_not_b_alloc;
45
-
46
- // for serialization as raw bytes
47
- template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
48
- template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
49
-
50
- template<typename A>
27
+ template<typename Allocator = std::allocator<uint64_t>>
51
28
  class theta_sketch_alloc {
52
29
  public:
53
- static const uint64_t MAX_THETA = LLONG_MAX; // signed max for compatibility with Java
54
- static const uint8_t SERIAL_VERSION = 3;
30
+ using Entry = uint64_t;
31
+ using ExtractKey = trivial_extract_key;
32
+ using iterator = theta_iterator<Entry, ExtractKey>;
33
+ using const_iterator = theta_const_iterator<Entry, ExtractKey>;
55
34
 
56
35
  virtual ~theta_sketch_alloc() = default;
57
36
 
37
+ /**
38
+ * @return allocator
39
+ */
40
+ virtual Allocator get_allocator() const = 0;
41
+
58
42
  /**
59
43
  * @return true if this sketch represents an empty set (not the same as no retained entries!)
60
44
  */
61
- bool is_empty() const;
45
+ virtual bool is_empty() const = 0;
62
46
 
63
47
  /**
64
48
  * @return estimate of the distinct count of the input stream
@@ -96,13 +80,16 @@ public:
96
80
  /**
97
81
  * @return theta as a positive integer between 0 and LLONG_MAX
98
82
  */
99
- uint64_t get_theta64() const;
83
+ virtual uint64_t get_theta64() const = 0;
100
84
 
101
85
  /**
102
86
  * @return the number of retained entries in the sketch
103
87
  */
104
88
  virtual uint32_t get_num_retained() const = 0;
105
89
 
90
+ /**
91
+ * @return hash of the seed that was used to hash the input
92
+ */
106
93
  virtual uint16_t get_seed_hash() const = 0;
107
94
 
108
95
  /**
@@ -111,109 +98,82 @@ public:
111
98
  virtual bool is_ordered() const = 0;
112
99
 
113
100
  /**
114
- * Writes a human-readable summary of this sketch to a given stream
101
+ * Provides a human-readable summary of this sketch as a string
115
102
  * @param print_items if true include the list of items retained by the sketch
103
+ * @return sketch summary as a string
116
104
  */
117
- virtual string<A> to_string(bool print_items = false) const = 0;
105
+ virtual string<Allocator> to_string(bool print_items = false) const;
118
106
 
119
107
  /**
120
- * This method serializes the sketch into a given stream in a binary form
121
- * @param os output stream
122
- */
123
- virtual void serialize(std::ostream& os) const = 0;
124
-
125
- // This is a convenience alias for users
126
- // The type returned by the following serialize method
127
- typedef vector_u8<A> vector_bytes;
128
-
129
- /**
130
- * This method serializes the sketch as a vector of bytes.
131
- * An optional header can be reserved in front of the sketch.
132
- * It is an uninitialized space of a given size.
133
- * This header is used in Datasketches PostgreSQL extension.
134
- * @param header_size_bytes space to reserve in front of the sketch
135
- */
136
- virtual vector_bytes serialize(unsigned header_size_bytes = 0) const = 0;
137
-
138
- // This is a convenience alias for users
139
- // The type returned by the following deserialize methods
140
- // It is not possible to return instances of an abstract type, so this has to be a pointer
141
- typedef std::unique_ptr<theta_sketch_alloc<A>, std::function<void(theta_sketch_alloc<A>*)>> unique_ptr;
142
-
143
- /**
144
- * This method deserializes a sketch from a given stream.
145
- * @param is input stream
146
- * @param seed the seed for the hash function that was used to create the sketch
147
- * @return an instance of a sketch as a unique_ptr
108
+ * Iterator over hash values in this sketch.
109
+ * @return begin iterator
148
110
  */
149
- static unique_ptr deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
111
+ virtual iterator begin() = 0;
150
112
 
151
113
  /**
152
- * This method deserializes a sketch from a given array of bytes.
153
- * @param bytes pointer to the array of bytes
154
- * @param size the size of the array
155
- * @param seed the seed for the hash function that was used to create the sketch
156
- * @return an instance of the sketch
114
+ * Iterator pointing past the valid range.
115
+ * Not to be incremented or dereferenced.
116
+ * @return end iterator
157
117
  */
158
- static unique_ptr deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
159
-
160
- class const_iterator;
118
+ virtual iterator end() = 0;
161
119
 
162
120
  /**
163
- * Iterator over hash values in this sketch.
121
+ * Const iterator over hash values in this sketch.
164
122
  * @return begin iterator
165
123
  */
166
124
  virtual const_iterator begin() const = 0;
167
125
 
168
126
  /**
169
- * Iterator pointing past the valid range.
127
+ * Const iterator pointing past the valid range.
170
128
  * Not to be incremented or dereferenced.
171
129
  * @return end iterator
172
130
  */
173
131
  virtual const_iterator end() const = 0;
174
132
 
175
133
  protected:
176
- enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
177
-
178
- bool is_empty_;
179
- uint64_t theta_;
180
-
181
- theta_sketch_alloc(bool is_empty, uint64_t theta);
182
-
183
- static uint16_t get_seed_hash(uint64_t seed);
184
-
185
- static void check_sketch_type(uint8_t actual, uint8_t expected);
186
- static void check_serial_version(uint8_t actual, uint8_t expected);
187
- static void check_seed_hash(uint16_t actual, uint16_t expected);
188
-
189
- friend theta_intersection_alloc<A>;
190
- friend theta_a_not_b_alloc<A>;
134
+ using ostrstream = std::basic_ostringstream<char, std::char_traits<char>, AllocChar<Allocator>>;
135
+ virtual void print_specifics(ostrstream& os) const = 0;
191
136
  };
192
137
 
193
- // update sketch
194
-
195
- template<typename A> using AllocU64 = typename std::allocator_traits<A>::template rebind_alloc<uint64_t>;
196
- template<typename A> using vector_u64 = std::vector<uint64_t, AllocU64<A>>;
138
+ // forward declaration
139
+ template<typename A> class compact_theta_sketch_alloc;
197
140
 
198
- template<typename A>
199
- class update_theta_sketch_alloc: public theta_sketch_alloc<A> {
141
+ template<typename Allocator = std::allocator<uint64_t>>
142
+ class update_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
200
143
  public:
201
- class builder;
202
- enum resize_factor { X1, X2, X4, X8 };
203
- static const uint8_t SKETCH_TYPE = 2;
144
+ using Base = theta_sketch_alloc<Allocator>;
145
+ using Entry = typename Base::Entry;
146
+ using ExtractKey = typename Base::ExtractKey;
147
+ using iterator = typename Base::iterator;
148
+ using const_iterator = typename Base::const_iterator;
149
+ using theta_table = theta_update_sketch_base<Entry, ExtractKey, Allocator>;
150
+ using resize_factor = typename theta_table::resize_factor;
204
151
 
205
152
  // No constructor here. Use builder instead.
153
+ class builder;
206
154
 
155
+ update_theta_sketch_alloc(const update_theta_sketch_alloc&) = default;
156
+ update_theta_sketch_alloc(update_theta_sketch_alloc&&) noexcept = default;
207
157
  virtual ~update_theta_sketch_alloc() = default;
158
+ update_theta_sketch_alloc& operator=(const update_theta_sketch_alloc&) = default;
159
+ update_theta_sketch_alloc& operator=(update_theta_sketch_alloc&&) = default;
208
160
 
209
- virtual uint32_t get_num_retained() const;
210
- virtual uint16_t get_seed_hash() const;
161
+ virtual Allocator get_allocator() const;
162
+ virtual bool is_empty() const;
211
163
  virtual bool is_ordered() const;
212
- virtual string<A> to_string(bool print_items = false) const;
213
- virtual void serialize(std::ostream& os) const;
214
- typedef vector_u8<A> vector_bytes; // alias for users
215
- // header space is reserved, but not initialized
216
- virtual vector_bytes serialize(unsigned header_size_bytes = 0) const;
164
+ virtual uint16_t get_seed_hash() const;
165
+ virtual uint64_t get_theta64() const;
166
+ virtual uint32_t get_num_retained() const;
167
+
168
+ /**
169
+ * @return configured nominal number of entries in the sketch
170
+ */
171
+ uint8_t get_lg_k() const;
172
+
173
+ /**
174
+ * @return configured resize factor of the sketch
175
+ */
176
+ resize_factor get_rf() const;
217
177
 
218
178
  /**
219
179
  * Update this sketch with a given string.
@@ -302,7 +262,7 @@ public:
302
262
  * @param data pointer to the data
303
263
  * @param length of the data in bytes
304
264
  */
305
- void update(const void* data, unsigned length);
265
+ void update(const void* data, size_t length);
306
266
 
307
267
  /**
308
268
  * Remove retained entries in excess of the nominal size k (if any)
@@ -314,105 +274,86 @@ public:
314
274
  * @param ordered optional flag to specify if ordered sketch should be produced
315
275
  * @return compact sketch
316
276
  */
317
- compact_theta_sketch_alloc<A> compact(bool ordered = true) const;
318
-
319
- virtual typename theta_sketch_alloc<A>::const_iterator begin() const;
320
- virtual typename theta_sketch_alloc<A>::const_iterator end() const;
321
-
322
- /**
323
- * This method deserializes a sketch from a given stream.
324
- * @param is input stream
325
- * @param seed the seed for the hash function that was used to create the sketch
326
- * @return an instance of a sketch
327
- */
328
- static update_theta_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
277
+ compact_theta_sketch_alloc<Allocator> compact(bool ordered = true) const;
329
278
 
330
- /**
331
- * This method deserializes a sketch from a given array of bytes.
332
- * @param bytes pointer to the array of bytes
333
- * @param size the size of the array
334
- * @param seed the seed for the hash function that was used to create the sketch
335
- * @return an instance of the sketch
336
- */
337
- static update_theta_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
279
+ virtual iterator begin();
280
+ virtual iterator end();
281
+ virtual const_iterator begin() const;
282
+ virtual const_iterator end() const;
338
283
 
339
284
  private:
340
- // resize threshold = 0.5 tuned for speed
341
- static constexpr double RESIZE_THRESHOLD = 0.5;
342
- // hash table rebuild threshold = 15/16
343
- static constexpr double REBUILD_THRESHOLD = 15.0 / 16.0;
344
-
345
- static constexpr uint8_t STRIDE_HASH_BITS = 7;
346
- static constexpr uint32_t STRIDE_MASK = (1 << STRIDE_HASH_BITS) - 1;
347
-
348
- uint8_t lg_cur_size_;
349
- uint8_t lg_nom_size_;
350
- vector_u64<A> keys_;
351
- uint32_t num_keys_;
352
- resize_factor rf_;
353
- float p_;
354
- uint64_t seed_;
355
- uint32_t capacity_;
285
+ theta_table table_;
356
286
 
357
287
  // for builder
358
- update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed);
359
-
360
- // for deserialize
361
- update_theta_sketch_alloc(bool is_empty, uint64_t theta, uint8_t lg_cur_size, uint8_t lg_nom_size, vector_u64<A>&& keys, uint32_t num_keys, resize_factor rf, float p, uint64_t seed);
362
-
363
- void resize();
364
- void rebuild();
288
+ update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta,
289
+ uint64_t seed, const Allocator& allocator);
365
290
 
366
- friend theta_union_alloc<A>;
367
- void internal_update(uint64_t hash);
368
-
369
- friend theta_intersection_alloc<A>;
370
- friend theta_a_not_b_alloc<A>;
371
- static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
372
- static inline uint32_t get_stride(uint64_t hash, uint8_t lg_size);
373
- static bool hash_search_or_insert(uint64_t hash, uint64_t* table, uint8_t lg_size);
374
- static bool hash_search(uint64_t hash, const uint64_t* table, uint8_t lg_size);
375
-
376
- friend theta_sketch_alloc<A>;
377
- static update_theta_sketch_alloc<A> internal_deserialize(std::istream& is, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed);
378
- static update_theta_sketch_alloc<A> internal_deserialize(const void* bytes, size_t size, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed);
291
+ using ostrstream = typename Base::ostrstream;
292
+ virtual void print_specifics(ostrstream& os) const;
379
293
  };
380
294
 
381
295
  // compact sketch
382
296
 
383
- template<typename A>
384
- class compact_theta_sketch_alloc: public theta_sketch_alloc<A> {
297
+ template<typename Allocator = std::allocator<uint64_t>>
298
+ class compact_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
385
299
  public:
300
+ using Base = theta_sketch_alloc<Allocator>;
301
+ using iterator = typename Base::iterator;
302
+ using const_iterator = typename Base::const_iterator;
303
+ using AllocBytes = typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>;
304
+ using vector_bytes = std::vector<uint8_t, AllocBytes>;
305
+
306
+ static const uint8_t SERIAL_VERSION = 3;
386
307
  static const uint8_t SKETCH_TYPE = 3;
387
308
 
388
- // No constructor here.
389
309
  // Instances of this type can be obtained:
390
- // - by compacting an update_theta_sketch
310
+ // - by compacting an update_theta_sketch_alloc
391
311
  // - as a result of a set operation
392
312
  // - by deserializing a previously serialized compact sketch
393
313
 
394
- compact_theta_sketch_alloc(const theta_sketch_alloc<A>& other, bool ordered);
314
+ template<typename Other>
315
+ compact_theta_sketch_alloc(const Other& other, bool ordered);
316
+ compact_theta_sketch_alloc(const compact_theta_sketch_alloc&) = default;
317
+ compact_theta_sketch_alloc(compact_theta_sketch_alloc&&) noexcept = default;
395
318
  virtual ~compact_theta_sketch_alloc() = default;
319
+ compact_theta_sketch_alloc& operator=(const compact_theta_sketch_alloc&) = default;
320
+ compact_theta_sketch_alloc& operator=(compact_theta_sketch_alloc&&) = default;
396
321
 
322
+ virtual Allocator get_allocator() const;
323
+ virtual bool is_empty() const;
324
+ virtual bool is_ordered() const;
325
+ virtual uint64_t get_theta64() const;
397
326
  virtual uint32_t get_num_retained() const;
398
327
  virtual uint16_t get_seed_hash() const;
399
- virtual bool is_ordered() const;
400
- virtual string<A> to_string(bool print_items = false) const;
401
- virtual void serialize(std::ostream& os) const;
402
- typedef vector_u8<A> vector_bytes; // alias for users
403
- // header space is reserved, but not initialized
404
- virtual vector_bytes serialize(unsigned header_size_bytes = 0) const;
405
328
 
406
- virtual typename theta_sketch_alloc<A>::const_iterator begin() const;
407
- virtual typename theta_sketch_alloc<A>::const_iterator end() const;
329
+ /**
330
+ * This method serializes the sketch into a given stream in a binary form
331
+ * @param os output stream
332
+ */
333
+ void serialize(std::ostream& os) const;
334
+
335
+ /**
336
+ * This method serializes the sketch as a vector of bytes.
337
+ * An optional header can be reserved in front of the sketch.
338
+ * It is an uninitialized space of a given size.
339
+ * This header is used in Datasketches PostgreSQL extension.
340
+ * @param header_size_bytes space to reserve in front of the sketch
341
+ */
342
+ vector_bytes serialize(unsigned header_size_bytes = 0) const;
343
+
344
+ virtual iterator begin();
345
+ virtual iterator end();
346
+ virtual const_iterator begin() const;
347
+ virtual const_iterator end() const;
408
348
 
409
349
  /**
410
350
  * This method deserializes a sketch from a given stream.
411
351
  * @param is input stream
412
352
  * @param seed the seed for the hash function that was used to create the sketch
413
- * @return an instance of a sketch
353
+ * @return an instance of the sketch
414
354
  */
415
- static compact_theta_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
355
+ static compact_theta_sketch_alloc deserialize(std::istream& is,
356
+ uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
416
357
 
417
358
  /**
418
359
  * This method deserializes a sketch from a given array of bytes.
@@ -421,110 +362,76 @@ public:
421
362
  * @param seed the seed for the hash function that was used to create the sketch
422
363
  * @return an instance of the sketch
423
364
  */
424
- static compact_theta_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
365
+ static compact_theta_sketch_alloc deserialize(const void* bytes, size_t size,
366
+ uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
367
+
368
+ // for internal use
369
+ compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<uint64_t, Allocator>&& entries);
425
370
 
426
371
  private:
427
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
372
+ enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
428
373
 
429
- vector_u64<A> keys_;
430
- uint16_t seed_hash_;
374
+ bool is_empty_;
431
375
  bool is_ordered_;
376
+ uint16_t seed_hash_;
377
+ uint64_t theta_;
378
+ std::vector<uint64_t, Allocator> entries_;
432
379
 
433
- friend theta_sketch_alloc<A>;
434
- friend update_theta_sketch_alloc<A>;
435
- friend theta_union_alloc<A>;
436
- friend theta_intersection_alloc<A>;
437
- friend theta_a_not_b_alloc<A>;
438
- compact_theta_sketch_alloc(bool is_empty, uint64_t theta, vector_u64<A>&& keys, uint16_t seed_hash, bool is_ordered);
439
- static compact_theta_sketch_alloc<A> internal_deserialize(std::istream& is, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash);
440
- static compact_theta_sketch_alloc<A> internal_deserialize(const void* bytes, size_t size, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash);
380
+ using ostrstream = typename Base::ostrstream;
381
+ virtual void print_specifics(ostrstream& os) const;
441
382
  };
442
383
 
443
- // builder
444
-
445
- template<typename A>
446
- class update_theta_sketch_alloc<A>::builder {
384
+ template<typename Allocator>
385
+ class update_theta_sketch_alloc<Allocator>::builder: public theta_base_builder<builder, Allocator> {
447
386
  public:
448
- static const uint8_t MIN_LG_K = 5;
449
- static const uint8_t DEFAULT_LG_K = 12;
450
- static const resize_factor DEFAULT_RESIZE_FACTOR = X8;
451
-
452
- /**
453
- * Creates an instance of the builder with default parameters.
454
- */
455
- builder();
387
+ builder(const Allocator& allocator = Allocator());
388
+ update_theta_sketch_alloc build() const;
389
+ };
456
390
 
457
- /**
458
- * Set log2(k), where k is a nominal number of entries in the sketch
459
- * @param lg_k base 2 logarithm of nominal number of entries
460
- * @return this builder
461
- */
462
- builder& set_lg_k(uint8_t lg_k);
391
+ // This wraps a buffer containing a serialized compact sketch so it can be used in a set operation, avoiding some of the cost of deserialization.
392
+ // It does not take ownership of the buffer.
463
393
 
464
- /**
465
- * Set resize factor for the internal hash table (defaults to 8)
466
- * @param rf resize factor
467
- * @return this builder
468
- */
469
- builder& set_resize_factor(resize_factor rf);
394
+ template<typename Allocator = std::allocator<uint64_t>>
395
+ class wrapped_compact_theta_sketch_alloc {
396
+ public:
397
+ using const_iterator = const uint64_t*;
470
398
 
471
- /**
472
- * Set sampling probability (initial theta). The default is 1, so the sketch retains
473
- * all entries until it reaches the limit, at which point it goes into the estimation mode
474
- * and reduces the effective sampling probability (theta) as necessary.
475
- * @param p sampling probability
476
- * @return this builder
477
- */
478
- builder& set_p(float p);
399
+ Allocator get_allocator() const;
400
+ bool is_empty() const;
401
+ bool is_ordered() const;
402
+ uint64_t get_theta64() const;
403
+ uint32_t get_num_retained() const;
404
+ uint16_t get_seed_hash() const;
479
405
 
480
- /**
481
- * Set the seed for the hash function. Should be used carefully if needed.
482
- * Sketches produced with different seeds are not compatible
483
- * and cannot be mixed in set operations.
484
- * @param seed hash seed
485
- * @return this builder
486
- */
487
- builder& set_seed(uint64_t seed);
406
+ const_iterator begin() const;
407
+ const_iterator end() const;
488
408
 
489
409
  /**
490
- * This is to create an instance of the sketch with predefined parameters.
491
- * @return an instance of the sketch
410
+ * This method wraps a serialized compact sketch as an array of bytes.
411
+ * @param bytes pointer to the array of bytes
412
+ * @param size the size of the array
413
+ * @param seed the seed for the hash function that was used to create the sketch
414
+ * @return an instance of the sketch
492
415
  */
493
- update_theta_sketch_alloc<A> build() const;
416
+ static const wrapped_compact_theta_sketch_alloc wrap(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, bool dump_on_error = false);
494
417
 
495
418
  private:
496
- uint8_t lg_k_;
497
- resize_factor rf_;
498
- float p_;
499
- uint64_t seed_;
500
-
501
- static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
502
- };
503
-
504
- // iterator
505
- template<typename A>
506
- class theta_sketch_alloc<A>::const_iterator: public std::iterator<std::input_iterator_tag, uint64_t> {
507
- public:
508
- const_iterator& operator++();
509
- const_iterator operator++(int);
510
- bool operator==(const const_iterator& other) const;
511
- bool operator!=(const const_iterator& other) const;
512
- uint64_t operator*() const;
419
+ bool is_empty_;
420
+ bool is_ordered_;
421
+ uint16_t seed_hash_;
422
+ uint32_t num_entries_;
423
+ uint64_t theta_;
424
+ const uint64_t* entries_;
513
425
 
514
- private:
515
- const uint64_t* keys_;
516
- uint32_t size_;
517
- uint32_t index_;
518
- const_iterator(const uint64_t* keys, uint32_t size, uint32_t index);
519
- friend class update_theta_sketch_alloc<A>;
520
- friend class compact_theta_sketch_alloc<A>;
426
+ wrapped_compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint32_t num_entries,
427
+ uint64_t theta, const uint64_t* entries);
521
428
  };
522
429
 
523
-
524
430
  // aliases with default allocator for convenience
525
- typedef theta_sketch_alloc<std::allocator<void>> theta_sketch;
526
- typedef update_theta_sketch_alloc<std::allocator<void>> update_theta_sketch;
527
- typedef compact_theta_sketch_alloc<std::allocator<void>> compact_theta_sketch;
431
+ using theta_sketch = theta_sketch_alloc<std::allocator<uint64_t>>;
432
+ using update_theta_sketch = update_theta_sketch_alloc<std::allocator<uint64_t>>;
433
+ using compact_theta_sketch = compact_theta_sketch_alloc<std::allocator<uint64_t>>;
434
+ using wrapped_compact_theta_sketch = wrapped_compact_theta_sketch_alloc<std::allocator<uint64_t>>;
528
435
 
529
436
  } /* namespace datasketches */
530
437