datasketches 0.1.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (205) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +17 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/ext/datasketches/cpc_wrapper.cpp +12 -13
  6. data/ext/datasketches/ext.cpp +1 -1
  7. data/ext/datasketches/ext.h +4 -0
  8. data/ext/datasketches/extconf.rb +1 -1
  9. data/ext/datasketches/fi_wrapper.cpp +6 -8
  10. data/ext/datasketches/hll_wrapper.cpp +13 -14
  11. data/ext/datasketches/kll_wrapper.cpp +28 -76
  12. data/ext/datasketches/theta_wrapper.cpp +27 -41
  13. data/ext/datasketches/vo_wrapper.cpp +4 -6
  14. data/lib/datasketches/version.rb +1 -1
  15. data/vendor/datasketches-cpp/CMakeLists.txt +10 -0
  16. data/vendor/datasketches-cpp/LICENSE +40 -3
  17. data/vendor/datasketches-cpp/NOTICE +1 -1
  18. data/vendor/datasketches-cpp/README.md +4 -4
  19. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +18 -7
  20. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  21. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  24. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  25. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
  26. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  27. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
  28. data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
  29. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +13 -3
  31. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +20 -20
  32. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +116 -105
  33. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +22 -6
  34. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +140 -101
  35. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
  36. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +20 -20
  37. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -16
  38. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +6 -6
  39. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +10 -10
  40. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +21 -21
  41. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
  42. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  43. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
  44. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  45. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  46. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
  47. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +102 -105
  48. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
  49. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +141 -125
  50. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
  51. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +5 -5
  52. data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
  53. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +81 -109
  54. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +25 -24
  55. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  56. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +5 -5
  57. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +89 -105
  58. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +13 -13
  59. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +130 -165
  60. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +21 -22
  61. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  62. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
  63. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  64. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
  65. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +88 -83
  66. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  67. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +34 -45
  68. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +7 -8
  69. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +41 -52
  70. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +7 -8
  71. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +220 -251
  72. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +42 -42
  73. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +36 -38
  74. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  75. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +15 -14
  76. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +47 -44
  77. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +62 -87
  78. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +121 -128
  79. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
  80. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  81. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  82. data/vendor/datasketches-cpp/hll/include/hll.hpp +25 -53
  83. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +8 -8
  84. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +36 -36
  85. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +28 -28
  86. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  87. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +37 -37
  88. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +57 -61
  89. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  90. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  91. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  92. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  93. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  94. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  95. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +40 -25
  96. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +50 -6
  97. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +164 -136
  98. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  99. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  100. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  101. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
  102. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +178 -88
  103. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  104. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  105. data/vendor/datasketches-cpp/python/CMakeLists.txt +12 -6
  106. data/vendor/datasketches-cpp/python/README.md +52 -49
  107. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  108. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  109. data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
  110. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -6
  111. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +4 -2
  112. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
  113. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +38 -28
  114. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  115. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  116. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -2
  117. data/vendor/datasketches-cpp/python/tests/kll_test.py +5 -5
  118. data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
  119. data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
  120. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  121. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  122. data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
  123. data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +18 -8
  124. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
  125. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +488 -0
  126. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
  127. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
  128. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
  129. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
  130. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
  131. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  132. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  133. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  134. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  135. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  136. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
  137. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
  138. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +19 -13
  139. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +130 -127
  140. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
  141. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +41 -49
  142. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
  143. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
  144. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  145. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -44
  146. data/vendor/datasketches-cpp/setup.py +11 -6
  147. data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
  148. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +3 -2
  149. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
  150. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  151. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  152. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
  153. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
  154. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
  155. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +11 -4
  156. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
  157. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +26 -28
  158. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
  159. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
  160. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
  161. data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
  162. data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +24 -36
  163. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
  164. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
  165. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +163 -256
  166. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +250 -651
  167. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
  168. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
  169. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +6 -1
  170. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
  171. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +10 -21
  172. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +44 -30
  173. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  174. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  175. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  176. data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +60 -5
  177. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +74 -235
  178. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  179. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
  180. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  181. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
  182. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
  183. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +57 -70
  184. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
  185. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  186. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +18 -21
  187. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +13 -16
  188. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +7 -6
  189. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +3 -3
  190. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  191. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +13 -16
  192. metadata +51 -36
  193. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
  194. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
  195. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
  196. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
  197. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
  198. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
  199. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
  200. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
  201. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  202. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  203. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  204. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
  205. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -20,35 +20,24 @@
20
20
  #ifndef THETA_SKETCH_IMPL_HPP_
21
21
  #define THETA_SKETCH_IMPL_HPP_
22
22
 
23
- #include <algorithm>
24
- #include <cmath>
25
- #include <memory>
26
- #include <functional>
27
- #include <istream>
28
- #include <ostream>
29
23
  #include <sstream>
24
+ #include <vector>
30
25
 
31
- #include "MurmurHash3.h"
32
26
  #include "serde.hpp"
33
27
  #include "binomial_bounds.hpp"
34
- #include "memory_operations.hpp"
28
+ #include "theta_helpers.hpp"
29
+ #include "compact_theta_sketch_parser.hpp"
35
30
 
36
31
  namespace datasketches {
37
32
 
38
- /*
39
- * author Alexander Saydakov
40
- * author Lee Rhodes
41
- * author Kevin Lang
42
- */
43
-
44
33
  template<typename A>
45
- theta_sketch_alloc<A>::theta_sketch_alloc(bool is_empty, uint64_t theta):
46
- is_empty_(is_empty), theta_(theta)
47
- {}
34
+ bool theta_sketch_alloc<A>::is_estimation_mode() const {
35
+ return get_theta64() < theta_constants::MAX_THETA && !is_empty();
36
+ }
48
37
 
49
38
  template<typename A>
50
- bool theta_sketch_alloc<A>::is_empty() const {
51
- return is_empty_;
39
+ double theta_sketch_alloc<A>::get_theta() const {
40
+ return static_cast<double>(get_theta64()) / theta_constants::MAX_THETA;
52
41
  }
53
42
 
54
43
  template<typename A>
@@ -69,182 +58,47 @@ double theta_sketch_alloc<A>::get_upper_bound(uint8_t num_std_devs) const {
69
58
  }
70
59
 
71
60
  template<typename A>
72
- bool theta_sketch_alloc<A>::is_estimation_mode() const {
73
- return theta_ < MAX_THETA && !is_empty_;
74
- }
75
-
76
- template<typename A>
77
- double theta_sketch_alloc<A>::get_theta() const {
78
- return (double) theta_ / MAX_THETA;
79
- }
80
-
81
- template<typename A>
82
- uint64_t theta_sketch_alloc<A>::get_theta64() const {
83
- return theta_;
84
- }
85
-
86
- template<typename A>
87
- typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed) {
88
- uint8_t preamble_longs;
89
- is.read((char*)&preamble_longs, sizeof(preamble_longs));
90
- uint8_t serial_version;
91
- is.read((char*)&serial_version, sizeof(serial_version));
92
- uint8_t type;
93
- is.read((char*)&type, sizeof(type));
94
- uint8_t lg_nom_size;
95
- is.read((char*)&lg_nom_size, sizeof(lg_nom_size));
96
- uint8_t lg_cur_size;
97
- is.read((char*)&lg_cur_size, sizeof(lg_cur_size));
98
- uint8_t flags_byte;
99
- is.read((char*)&flags_byte, sizeof(flags_byte));
100
- uint16_t seed_hash;
101
- is.read((char*)&seed_hash, sizeof(seed_hash));
102
-
103
- check_serial_version(serial_version, SERIAL_VERSION);
104
-
105
- if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
106
- check_seed_hash(seed_hash, get_seed_hash(seed));
107
- typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
108
- typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
109
- return unique_ptr(
110
- static_cast<theta_sketch_alloc<A>*>(new (AU().allocate(1)) update_theta_sketch_alloc<A>(update_theta_sketch_alloc<A>::internal_deserialize(is, rf, lg_cur_size, lg_nom_size, flags_byte, seed))),
111
- [](theta_sketch_alloc<A>* ptr) {
112
- ptr->~theta_sketch_alloc();
113
- AU().deallocate(static_cast<update_theta_sketch_alloc<A>*>(ptr), 1);
114
- }
115
- );
116
- } else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
117
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
118
- if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
119
- typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
120
- return unique_ptr(
121
- static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(compact_theta_sketch_alloc<A>::internal_deserialize(is, preamble_longs, flags_byte, seed_hash))),
122
- [](theta_sketch_alloc<A>* ptr) {
123
- ptr->~theta_sketch_alloc();
124
- AC().deallocate(static_cast<compact_theta_sketch_alloc<A>*>(ptr), 1);
125
- }
126
- );
127
- }
128
- throw std::invalid_argument("unsupported sketch type " + std::to_string((int) type));
129
- }
130
-
131
- template<typename A>
132
- typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed) {
133
- ensure_minimum_memory(size, static_cast<size_t>(8));
134
- const char* ptr = static_cast<const char*>(bytes);
135
- uint8_t preamble_longs;
136
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
137
- uint8_t serial_version;
138
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
139
- uint8_t type;
140
- ptr += copy_from_mem(ptr, &type, sizeof(type));
141
- uint8_t lg_nom_size;
142
- ptr += copy_from_mem(ptr, &lg_nom_size, sizeof(lg_nom_size));
143
- uint8_t lg_cur_size;
144
- ptr += copy_from_mem(ptr, &lg_cur_size, sizeof(lg_cur_size));
145
- uint8_t flags_byte;
146
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
147
- uint16_t seed_hash;
148
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
149
-
150
- check_serial_version(serial_version, SERIAL_VERSION);
151
-
152
- if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
153
- check_seed_hash(seed_hash, get_seed_hash(seed));
154
- typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
155
- typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
156
- return unique_ptr(
157
- static_cast<theta_sketch_alloc<A>*>(new (AU().allocate(1)) update_theta_sketch_alloc<A>(
158
- update_theta_sketch_alloc<A>::internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), rf, lg_cur_size, lg_nom_size, flags_byte, seed))
159
- ),
160
- [](theta_sketch_alloc<A>* ptr) {
161
- ptr->~theta_sketch_alloc();
162
- AU().deallocate(static_cast<update_theta_sketch_alloc<A>*>(ptr), 1);
163
- }
164
- );
165
- } else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
166
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
167
- if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
168
- typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
169
- return unique_ptr(
170
- static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(
171
- compact_theta_sketch_alloc<A>::internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), preamble_longs, flags_byte, seed_hash))
172
- ),
173
- [](theta_sketch_alloc<A>* ptr) {
174
- ptr->~theta_sketch_alloc();
175
- AC().deallocate(static_cast<compact_theta_sketch_alloc<A>*>(ptr), 1);
176
- }
177
- );
178
- }
179
- throw std::invalid_argument("unsupported sketch type " + std::to_string((int) type));
180
- }
181
-
182
- template<typename A>
183
- uint16_t theta_sketch_alloc<A>::get_seed_hash(uint64_t seed) {
184
- HashState hashes;
185
- MurmurHash3_x64_128(&seed, sizeof(seed), 0, hashes);
186
- return hashes.h1;
187
- }
188
-
189
- template<typename A>
190
- void theta_sketch_alloc<A>::check_sketch_type(uint8_t actual, uint8_t expected) {
191
- if (actual != expected) {
192
- throw std::invalid_argument("Sketch type mismatch: expected " + std::to_string((int)expected) + ", actual " + std::to_string((int)actual));
193
- }
194
- }
195
-
196
- template<typename A>
197
- void theta_sketch_alloc<A>::check_serial_version(uint8_t actual, uint8_t expected) {
198
- if (actual != expected) {
199
- throw std::invalid_argument("Sketch serial version mismatch: expected " + std::to_string((int)expected) + ", actual " + std::to_string((int)actual));
200
- }
201
- }
202
-
203
- template<typename A>
204
- void theta_sketch_alloc<A>::check_seed_hash(uint16_t actual, uint16_t expected) {
205
- if (actual != expected) {
206
- throw std::invalid_argument("Sketch seed hash mismatch: expected " + std::to_string(expected) + ", actual " + std::to_string(actual));
61
+ string<A> theta_sketch_alloc<A>::to_string(bool detail) const {
62
+ ostrstream os;
63
+ os << "### Theta sketch summary:" << std::endl;
64
+ os << " num retained entries : " << get_num_retained() << std::endl;
65
+ os << " seed hash : " << get_seed_hash() << std::endl;
66
+ os << " empty? : " << (is_empty() ? "true" : "false") << std::endl;
67
+ os << " ordered? : " << (is_ordered() ? "true" : "false") << std::endl;
68
+ os << " estimation mode? : " << (is_estimation_mode() ? "true" : "false") << std::endl;
69
+ os << " theta (fraction) : " << get_theta() << std::endl;
70
+ os << " theta (raw 64-bit) : " << get_theta64() << std::endl;
71
+ os << " estimate : " << this->get_estimate() << std::endl;
72
+ os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
73
+ os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
74
+ print_specifics(os);
75
+ os << "### End sketch summary" << std::endl;
76
+ if (detail) {
77
+ os << "### Retained entries" << std::endl;
78
+ for (const auto& hash: *this) {
79
+ os << hash << std::endl;
80
+ }
81
+ os << "### End retained entries" << std::endl;
207
82
  }
83
+ return os.str();
208
84
  }
209
85
 
210
86
  // update sketch
211
87
 
212
88
  template<typename A>
213
- update_theta_sketch_alloc<A>::update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed):
214
- theta_sketch_alloc<A>(true, theta_sketch_alloc<A>::MAX_THETA),
215
- lg_cur_size_(lg_cur_size),
216
- lg_nom_size_(lg_nom_size),
217
- keys_(1 << lg_cur_size_, 0),
218
- num_keys_(0),
219
- rf_(rf),
220
- p_(p),
221
- seed_(seed),
222
- capacity_(get_capacity(lg_cur_size, lg_nom_size))
223
- {
224
- if (p < 1) this->theta_ *= p;
225
- }
226
-
227
- template<typename A>
228
- update_theta_sketch_alloc<A>::update_theta_sketch_alloc(bool is_empty, uint64_t theta, uint8_t lg_cur_size, uint8_t lg_nom_size, vector_u64<A>&& keys, uint32_t num_keys, resize_factor rf, float p, uint64_t seed):
229
- theta_sketch_alloc<A>(is_empty, theta),
230
- lg_cur_size_(lg_cur_size),
231
- lg_nom_size_(lg_nom_size),
232
- keys_(std::move(keys)),
233
- num_keys_(num_keys),
234
- rf_(rf),
235
- p_(p),
236
- seed_(seed),
237
- capacity_(get_capacity(lg_cur_size, lg_nom_size))
89
+ update_theta_sketch_alloc<A>::update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
90
+ uint64_t theta, uint64_t seed, const A& allocator):
91
+ table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator)
238
92
  {}
239
93
 
240
94
  template<typename A>
241
- uint32_t update_theta_sketch_alloc<A>::get_num_retained() const {
242
- return num_keys_;
95
+ A update_theta_sketch_alloc<A>::get_allocator() const {
96
+ return table_.allocator_;
243
97
  }
244
98
 
245
99
  template<typename A>
246
- uint16_t update_theta_sketch_alloc<A>::get_seed_hash() const {
247
- return theta_sketch_alloc<A>::get_seed_hash(seed_);
100
+ bool update_theta_sketch_alloc<A>::is_empty() const {
101
+ return table_.is_empty_;
248
102
  }
249
103
 
250
104
  template<typename A>
@@ -253,169 +107,28 @@ bool update_theta_sketch_alloc<A>::is_ordered() const {
253
107
  }
254
108
 
255
109
  template<typename A>
256
- string<A> update_theta_sketch_alloc<A>::to_string(bool print_items) const {
257
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
258
- os << "### Update Theta sketch summary:" << std::endl;
259
- os << " lg nominal size : " << (int) lg_nom_size_ << std::endl;
260
- os << " lg current size : " << (int) lg_cur_size_ << std::endl;
261
- os << " num retained keys : " << num_keys_ << std::endl;
262
- os << " resize factor : " << (1 << rf_) << std::endl;
263
- os << " sampling probability : " << p_ << std::endl;
264
- os << " seed hash : " << this->get_seed_hash() << std::endl;
265
- os << " empty? : " << (this->is_empty() ? "true" : "false") << std::endl;
266
- os << " ordered? : " << (this->is_ordered() ? "true" : "false") << std::endl;
267
- os << " estimation mode? : " << (this->is_estimation_mode() ? "true" : "false") << std::endl;
268
- os << " theta (fraction) : " << this->get_theta() << std::endl;
269
- os << " theta (raw 64-bit) : " << this->theta_ << std::endl;
270
- os << " estimate : " << this->get_estimate() << std::endl;
271
- os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
272
- os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
273
- os << "### End sketch summary" << std::endl;
274
- if (print_items) {
275
- os << "### Retained keys" << std::endl;
276
- for (auto key: *this) os << " " << key << std::endl;
277
- os << "### End retained keys" << std::endl;
278
- }
279
- return os.str();
280
- }
281
-
282
- template<typename A>
283
- void update_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
284
- const uint8_t preamble_longs_and_rf = 3 | (rf_ << 6);
285
- os.write((char*)&preamble_longs_and_rf, sizeof(preamble_longs_and_rf));
286
- const uint8_t serial_version = theta_sketch_alloc<A>::SERIAL_VERSION;
287
- os.write((char*)&serial_version, sizeof(serial_version));
288
- const uint8_t type = SKETCH_TYPE;
289
- os.write((char*)&type, sizeof(type));
290
- os.write((char*)&lg_nom_size_, sizeof(lg_nom_size_));
291
- os.write((char*)&lg_cur_size_, sizeof(lg_cur_size_));
292
- const uint8_t flags_byte(
293
- (this->is_empty() ? 1 << theta_sketch_alloc<A>::flags::IS_EMPTY : 0)
294
- );
295
- os.write((char*)&flags_byte, sizeof(flags_byte));
296
- const uint16_t seed_hash = get_seed_hash();
297
- os.write((char*)&seed_hash, sizeof(seed_hash));
298
- os.write((char*)&num_keys_, sizeof(num_keys_));
299
- os.write((char*)&p_, sizeof(p_));
300
- os.write((char*)&(this->theta_), sizeof(uint64_t));
301
- os.write((char*)keys_.data(), sizeof(uint64_t) * keys_.size());
302
- }
303
-
304
- template<typename A>
305
- vector_u8<A> update_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
306
- const uint8_t preamble_longs = 3;
307
- const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + sizeof(uint64_t) * keys_.size();
308
- vector_u8<A> bytes(size);
309
- uint8_t* ptr = bytes.data() + header_size_bytes;
310
-
311
- const uint8_t preamble_longs_and_rf = preamble_longs | (rf_ << 6);
312
- ptr += copy_to_mem(&preamble_longs_and_rf, ptr, sizeof(preamble_longs_and_rf));
313
- const uint8_t serial_version = theta_sketch_alloc<A>::SERIAL_VERSION;
314
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
315
- const uint8_t type = SKETCH_TYPE;
316
- ptr += copy_to_mem(&type, ptr, sizeof(type));
317
- ptr += copy_to_mem(&lg_nom_size_, ptr, sizeof(lg_nom_size_));
318
- ptr += copy_to_mem(&lg_cur_size_, ptr, sizeof(lg_cur_size_));
319
- const uint8_t flags_byte(
320
- (this->is_empty() ? 1 << theta_sketch_alloc<A>::flags::IS_EMPTY : 0)
321
- );
322
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
323
- const uint16_t seed_hash = get_seed_hash();
324
- ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
325
- ptr += copy_to_mem(&num_keys_, ptr, sizeof(num_keys_));
326
- ptr += copy_to_mem(&p_, ptr, sizeof(p_));
327
- ptr += copy_to_mem(&(this->theta_), ptr, sizeof(uint64_t));
328
- ptr += copy_to_mem(keys_.data(), ptr, sizeof(uint64_t) * keys_.size());
329
-
330
- return bytes;
110
+ uint64_t update_theta_sketch_alloc<A>::get_theta64() const {
111
+ return table_.theta_;
331
112
  }
332
113
 
333
114
  template<typename A>
334
- update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed) {
335
- uint8_t preamble_longs;
336
- is.read((char*)&preamble_longs, sizeof(preamble_longs));
337
- resize_factor rf = static_cast<resize_factor>(preamble_longs >> 6);
338
- preamble_longs &= 0x3f; // remove resize factor
339
- uint8_t serial_version;
340
- is.read((char*)&serial_version, sizeof(serial_version));
341
- uint8_t type;
342
- is.read((char*)&type, sizeof(type));
343
- uint8_t lg_nom_size;
344
- is.read((char*)&lg_nom_size, sizeof(lg_nom_size));
345
- uint8_t lg_cur_size;
346
- is.read((char*)&lg_cur_size, sizeof(lg_cur_size));
347
- uint8_t flags_byte;
348
- is.read((char*)&flags_byte, sizeof(flags_byte));
349
- uint16_t seed_hash;
350
- is.read((char*)&seed_hash, sizeof(seed_hash));
351
- theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
352
- theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
353
- theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
354
- return internal_deserialize(is, rf, lg_cur_size, lg_nom_size, flags_byte, seed);
115
+ uint32_t update_theta_sketch_alloc<A>::get_num_retained() const {
116
+ return table_.num_entries_;
355
117
  }
356
118
 
357
119
  template<typename A>
358
- update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::internal_deserialize(std::istream& is, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed) {
359
- uint32_t num_keys;
360
- is.read((char*)&num_keys, sizeof(num_keys));
361
- float p;
362
- is.read((char*)&p, sizeof(p));
363
- uint64_t theta;
364
- is.read((char*)&theta, sizeof(theta));
365
- vector_u64<A> keys(1 << lg_cur_size);
366
- is.read((char*)keys.data(), sizeof(uint64_t) * keys.size());
367
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
368
- if (!is.good()) throw std::runtime_error("error reading from std::istream");
369
- return update_theta_sketch_alloc<A>(is_empty, theta, lg_cur_size, lg_nom_size, std::move(keys), num_keys, rf, p, seed);
120
+ uint16_t update_theta_sketch_alloc<A>::get_seed_hash() const {
121
+ return compute_seed_hash(table_.seed_);
370
122
  }
371
123
 
372
124
  template<typename A>
373
- update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed) {
374
- ensure_minimum_memory(size, 8);
375
- const char* ptr = static_cast<const char*>(bytes);
376
- uint8_t preamble_longs;
377
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
378
- resize_factor rf = static_cast<resize_factor>(preamble_longs >> 6);
379
- preamble_longs &= 0x3f; // remove resize factor
380
- uint8_t serial_version;
381
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
382
- uint8_t type;
383
- ptr += copy_from_mem(ptr, &type, sizeof(type));
384
- uint8_t lg_nom_size;
385
- ptr += copy_from_mem(ptr, &lg_nom_size, sizeof(lg_nom_size));
386
- uint8_t lg_cur_size;
387
- ptr += copy_from_mem(ptr, &lg_cur_size, sizeof(lg_cur_size));
388
- uint8_t flags_byte;
389
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
390
- uint16_t seed_hash;
391
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
392
- theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
393
- theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
394
- theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
395
- return internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), rf, lg_cur_size, lg_nom_size, flags_byte, seed);
125
+ uint8_t update_theta_sketch_alloc<A>::get_lg_k() const {
126
+ return table_.lg_nom_size_;
396
127
  }
397
128
 
398
129
  template<typename A>
399
- update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::internal_deserialize(const void* bytes, size_t size, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed) {
400
- const uint32_t table_size = 1 << lg_cur_size;
401
- ensure_minimum_memory(size, 16 + sizeof(uint64_t) * table_size);
402
- const char* ptr = static_cast<const char*>(bytes);
403
- uint32_t num_keys;
404
- ptr += copy_from_mem(ptr, &num_keys, sizeof(num_keys));
405
- float p;
406
- ptr += copy_from_mem(ptr, &p, sizeof(p));
407
- uint64_t theta;
408
- ptr += copy_from_mem(ptr, &theta, sizeof(theta));
409
- vector_u64<A> keys(table_size);
410
- ptr += copy_from_mem(ptr, keys.data(), sizeof(uint64_t) * table_size);
411
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
412
- return update_theta_sketch_alloc<A>(is_empty, theta, lg_cur_size, lg_nom_size, std::move(keys), num_keys, rf, p, seed);
413
- }
414
-
415
- template<typename A>
416
- void update_theta_sketch_alloc<A>::update(const std::string& value) {
417
- if (value.empty()) return;
418
- update(value.c_str(), value.length());
130
+ auto update_theta_sketch_alloc<A>::get_rf() const -> resize_factor {
131
+ return table_.rf_;
419
132
  }
420
133
 
421
134
  template<typename A>
@@ -460,19 +173,7 @@ void update_theta_sketch_alloc<A>::update(int8_t value) {
460
173
 
461
174
  template<typename A>
462
175
  void update_theta_sketch_alloc<A>::update(double value) {
463
- union {
464
- int64_t long_value;
465
- double double_value;
466
- } long_double_union;
467
-
468
- if (value == 0.0) {
469
- long_double_union.double_value = 0.0; // canonicalize -0.0 to 0.0
470
- } else if (std::isnan(value)) {
471
- long_double_union.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits()
472
- } else {
473
- long_double_union.double_value = value;
474
- }
475
- update(&long_double_union, sizeof(long_double_union));
176
+ update(canonical_double(value));
476
177
  }
477
178
 
478
179
  template<typename A>
@@ -481,157 +182,117 @@ void update_theta_sketch_alloc<A>::update(float value) {
481
182
  }
482
183
 
483
184
  template<typename A>
484
- void update_theta_sketch_alloc<A>::update(const void* data, unsigned length) {
485
- HashState hashes;
486
- MurmurHash3_x64_128(data, length, seed_, hashes);
487
- const uint64_t hash = hashes.h1 >> 1; // Java implementation does logical shift >>> to make values positive
488
- internal_update(hash);
489
- }
490
-
491
- template<typename A>
492
- compact_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::compact(bool ordered) const {
493
- return compact_theta_sketch_alloc<A>(*this, ordered);
185
+ void update_theta_sketch_alloc<A>::update(const std::string& value) {
186
+ if (value.empty()) return;
187
+ update(value.c_str(), value.length());
494
188
  }
495
189
 
496
190
  template<typename A>
497
- void update_theta_sketch_alloc<A>::internal_update(uint64_t hash) {
498
- this->is_empty_ = false;
499
- if (hash >= this->theta_ || hash == 0) return; // hash == 0 is reserved to mark empty slots in the table
500
- if (hash_search_or_insert(hash, keys_.data(), lg_cur_size_)) {
501
- num_keys_++;
502
- if (num_keys_ > capacity_) {
503
- if (lg_cur_size_ <= lg_nom_size_) {
504
- resize();
505
- } else {
506
- rebuild();
507
- }
508
- }
191
+ void update_theta_sketch_alloc<A>::update(const void* data, size_t length) {
192
+ const uint64_t hash = table_.hash_and_screen(data, length);
193
+ if (hash == 0) return;
194
+ auto result = table_.find(hash);
195
+ if (!result.second) {
196
+ table_.insert(result.first, hash);
509
197
  }
510
198
  }
511
199
 
512
200
  template<typename A>
513
201
  void update_theta_sketch_alloc<A>::trim() {
514
- if (num_keys_ > static_cast<uint32_t>(1 << lg_nom_size_)) rebuild();
202
+ table_.trim();
515
203
  }
516
204
 
517
205
  template<typename A>
518
- void update_theta_sketch_alloc<A>::resize() {
519
- const uint8_t lg_tgt_size = lg_nom_size_ + 1;
520
- const uint8_t factor = std::max(1, std::min(static_cast<int>(rf_), lg_tgt_size - lg_cur_size_));
521
- const uint8_t lg_new_size = lg_cur_size_ + factor;
522
- const uint32_t new_size = 1 << lg_new_size;
523
- vector_u64<A> new_keys(new_size, 0);
524
- for (uint32_t i = 0; i < keys_.size(); i++) {
525
- if (keys_[i] != 0) {
526
- hash_search_or_insert(keys_[i], new_keys.data(), lg_new_size); // TODO hash_insert
527
- }
528
- }
529
- keys_ = std::move(new_keys);
530
- lg_cur_size_ += factor;
531
- capacity_ = get_capacity(lg_cur_size_, lg_nom_size_);
532
- }
533
-
534
- template<typename A>
535
- void update_theta_sketch_alloc<A>::rebuild() {
536
- const uint32_t pivot = (1 << lg_nom_size_) + keys_.size() - num_keys_;
537
- std::nth_element(keys_.begin(), keys_.begin() + pivot, keys_.end());
538
- this->theta_ = keys_[pivot];
539
- vector_u64<A> new_keys(keys_.size(), 0);
540
- num_keys_ = 0;
541
- for (uint32_t i = 0; i < keys_.size(); i++) {
542
- if (keys_[i] != 0 && keys_[i] < this->theta_) {
543
- hash_search_or_insert(keys_[i], new_keys.data(), lg_cur_size_); // TODO hash_insert
544
- num_keys_++;
545
- }
546
- }
547
- keys_ = std::move(new_keys);
206
+ auto update_theta_sketch_alloc<A>::begin() -> iterator {
207
+ return iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
548
208
  }
549
209
 
550
210
  template<typename A>
551
- uint32_t update_theta_sketch_alloc<A>::get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size) {
552
- const double fraction = (lg_cur_size <= lg_nom_size) ? RESIZE_THRESHOLD : REBUILD_THRESHOLD;
553
- return std::floor(fraction * (1 << lg_cur_size));
211
+ auto update_theta_sketch_alloc<A>::end() -> iterator {
212
+ return iterator(nullptr, 0, 1 << table_.lg_cur_size_);
554
213
  }
555
214
 
556
215
  template<typename A>
557
- uint32_t update_theta_sketch_alloc<A>::get_stride(uint64_t hash, uint8_t lg_size) {
558
- // odd and independent of index assuming lg_size lowest bits of the hash were used for the index
559
- return (2 * static_cast<uint32_t>((hash >> lg_size) & STRIDE_MASK)) + 1;
216
+ auto update_theta_sketch_alloc<A>::begin() const -> const_iterator {
217
+ return const_iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
560
218
  }
561
219
 
562
220
  template<typename A>
563
- bool update_theta_sketch_alloc<A>::hash_search_or_insert(uint64_t hash, uint64_t* table, uint8_t lg_size) {
564
- const uint32_t mask = (1 << lg_size) - 1;
565
- const uint32_t stride = get_stride(hash, lg_size);
566
- uint32_t cur_probe = static_cast<uint32_t>(hash) & mask;
221
+ auto update_theta_sketch_alloc<A>::end() const -> const_iterator {
222
+ return const_iterator(nullptr, 0, 1 << table_.lg_cur_size_);
223
+ }
567
224
 
568
- // search for duplicate or zero
569
- const uint32_t loop_index = cur_probe;
570
- do {
571
- const uint64_t value = table[cur_probe];
572
- if (value == 0) {
573
- table[cur_probe] = hash; // insert value
574
- return true;
575
- } else if (value == hash) {
576
- return false; // found a duplicate
577
- }
578
- cur_probe = (cur_probe + stride) & mask;
579
- } while (cur_probe != loop_index);
580
- throw std::logic_error("key not found and no empty slots!");
581
- }
582
-
583
- template<typename A>
584
- bool update_theta_sketch_alloc<A>::hash_search(uint64_t hash, const uint64_t* table, uint8_t lg_size) {
585
- const uint32_t mask = (1 << lg_size) - 1;
586
- const uint32_t stride = update_theta_sketch_alloc<A>::get_stride(hash, lg_size);
587
- uint32_t cur_probe = static_cast<uint32_t>(hash) & mask;
588
- const uint32_t loop_index = cur_probe;
589
- do {
590
- const uint64_t value = table[cur_probe];
591
- if (value == 0) {
592
- return false;
593
- } else if (value == hash) {
594
- return true;
595
- }
596
- cur_probe = (cur_probe + stride) & mask;
597
- } while (cur_probe != loop_index);
598
- throw std::logic_error("key not found and search wrapped");
225
+ template<typename A>
226
+ compact_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::compact(bool ordered) const {
227
+ return compact_theta_sketch_alloc<A>(*this, ordered);
599
228
  }
600
229
 
601
230
  template<typename A>
602
- typename theta_sketch_alloc<A>::const_iterator update_theta_sketch_alloc<A>::begin() const {
603
- return typename theta_sketch_alloc<A>::const_iterator(keys_.data(), keys_.size(), 0);
231
+ void update_theta_sketch_alloc<A>::print_specifics(ostrstream& os) const {
232
+ os << " lg nominal size : " << static_cast<int>(table_.lg_nom_size_) << std::endl;
233
+ os << " lg current size : " << static_cast<int>(table_.lg_cur_size_) << std::endl;
234
+ os << " resize factor : " << (1 << table_.rf_) << std::endl;
604
235
  }
605
236
 
237
+ // builder
238
+
239
+ template<typename A>
240
+ update_theta_sketch_alloc<A>::builder::builder(const A& allocator): theta_base_builder<builder, A>(allocator) {}
241
+
606
242
  template<typename A>
607
- typename theta_sketch_alloc<A>::const_iterator update_theta_sketch_alloc<A>::end() const {
608
- return typename theta_sketch_alloc<A>::const_iterator(keys_.data(), keys_.size(), keys_.size());
243
+ update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::builder::build() const {
244
+ return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
609
245
  }
610
246
 
611
247
  // compact sketch
612
248
 
613
249
  template<typename A>
614
- compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(bool is_empty, uint64_t theta, vector_u64<A>&& keys, uint16_t seed_hash, bool is_ordered):
615
- theta_sketch_alloc<A>(is_empty, theta),
616
- keys_(std::move(keys)),
250
+ template<typename Other>
251
+ compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(const Other& other, bool ordered):
252
+ is_empty_(other.is_empty()),
253
+ is_ordered_(other.is_ordered() || ordered),
254
+ seed_hash_(other.get_seed_hash()),
255
+ theta_(other.get_theta64()),
256
+ entries_(other.get_allocator())
257
+ {
258
+ entries_.reserve(other.get_num_retained());
259
+ std::copy(other.begin(), other.end(), std::back_inserter(entries_));
260
+ if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
261
+ }
262
+
263
+ template<typename A>
264
+ compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta,
265
+ std::vector<uint64_t, A>&& entries):
266
+ is_empty_(is_empty),
267
+ is_ordered_(is_ordered),
617
268
  seed_hash_(seed_hash),
618
- is_ordered_(is_ordered)
269
+ theta_(theta),
270
+ entries_(std::move(entries))
619
271
  {}
620
272
 
621
273
  template<typename A>
622
- compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(const theta_sketch_alloc<A>& other, bool ordered):
623
- theta_sketch_alloc<A>(other),
624
- keys_(other.get_num_retained()),
625
- seed_hash_(other.get_seed_hash()),
626
- is_ordered_(other.is_ordered() || ordered)
627
- {
628
- std::copy(other.begin(), other.end(), keys_.begin());
629
- if (ordered && !other.is_ordered()) std::sort(keys_.begin(), keys_.end());
274
+ A compact_theta_sketch_alloc<A>::get_allocator() const {
275
+ return entries_.get_allocator();
276
+ }
277
+
278
+ template<typename A>
279
+ bool compact_theta_sketch_alloc<A>::is_empty() const {
280
+ return is_empty_;
281
+ }
282
+
283
+ template<typename A>
284
+ bool compact_theta_sketch_alloc<A>::is_ordered() const {
285
+ return is_ordered_;
286
+ }
287
+
288
+ template<typename A>
289
+ uint64_t compact_theta_sketch_alloc<A>::get_theta64() const {
290
+ return theta_;
630
291
  }
631
292
 
632
293
  template<typename A>
633
294
  uint32_t compact_theta_sketch_alloc<A>::get_num_retained() const {
634
- return keys_.size();
295
+ return static_cast<uint32_t>(entries_.size());
635
296
  }
636
297
 
637
298
  template<typename A>
@@ -640,298 +301,236 @@ uint16_t compact_theta_sketch_alloc<A>::get_seed_hash() const {
640
301
  }
641
302
 
642
303
  template<typename A>
643
- bool compact_theta_sketch_alloc<A>::is_ordered() const {
644
- return is_ordered_;
304
+ auto compact_theta_sketch_alloc<A>::begin() -> iterator {
305
+ return iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);
645
306
  }
646
307
 
647
308
  template<typename A>
648
- string<A> compact_theta_sketch_alloc<A>::to_string(bool print_items) const {
649
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
650
- os << "### Compact Theta sketch summary:" << std::endl;
651
- os << " num retained keys : " << keys_.size() << std::endl;
652
- os << " seed hash : " << this->get_seed_hash() << std::endl;
653
- os << " empty? : " << (this->is_empty() ? "true" : "false") << std::endl;
654
- os << " ordered? : " << (this->is_ordered() ? "true" : "false") << std::endl;
655
- os << " estimation mode? : " << (this->is_estimation_mode() ? "true" : "false") << std::endl;
656
- os << " theta (fraction) : " << this->get_theta() << std::endl;
657
- os << " theta (raw 64-bit) : " << this->theta_ << std::endl;
658
- os << " estimate : " << this->get_estimate() << std::endl;
659
- os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
660
- os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
661
- os << "### End sketch summary" << std::endl;
662
- if (print_items) {
663
- os << "### Retained keys" << std::endl;
664
- for (auto key: *this) os << " " << key << std::endl;
665
- os << "### End retained keys" << std::endl;
666
- }
667
- return os.str();
309
+ auto compact_theta_sketch_alloc<A>::end() -> iterator {
310
+ return iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));
311
+ }
312
+
313
+ template<typename A>
314
+ auto compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
315
+ return const_iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);
316
+ }
317
+
318
+ template<typename A>
319
+ auto compact_theta_sketch_alloc<A>::end() const -> const_iterator {
320
+ return const_iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));
668
321
  }
669
322
 
323
+ template<typename A>
324
+ void compact_theta_sketch_alloc<A>::print_specifics(ostrstream&) const {}
325
+
670
326
  template<typename A>
671
327
  void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
672
- const bool is_single_item = keys_.size() == 1 && !this->is_estimation_mode();
328
+ const bool is_single_item = entries_.size() == 1 && !this->is_estimation_mode();
673
329
  const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
674
- os.write(reinterpret_cast<const char*>(&preamble_longs), sizeof(preamble_longs));
675
- const uint8_t serial_version = theta_sketch_alloc<A>::SERIAL_VERSION;
676
- os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
330
+ write(os, preamble_longs);
331
+ const uint8_t serial_version = SERIAL_VERSION;
332
+ write(os, serial_version);
677
333
  const uint8_t type = SKETCH_TYPE;
678
- os.write(reinterpret_cast<const char*>(&type), sizeof(type));
334
+ write(os, type);
679
335
  const uint16_t unused16 = 0;
680
- os.write(reinterpret_cast<const char*>(&unused16), sizeof(unused16));
336
+ write(os, unused16);
681
337
  const uint8_t flags_byte(
682
- (1 << theta_sketch_alloc<A>::flags::IS_COMPACT) |
683
- (1 << theta_sketch_alloc<A>::flags::IS_READ_ONLY) |
684
- (this->is_empty() ? 1 << theta_sketch_alloc<A>::flags::IS_EMPTY : 0) |
685
- (this->is_ordered() ? 1 << theta_sketch_alloc<A>::flags::IS_ORDERED : 0)
338
+ (1 << flags::IS_COMPACT) |
339
+ (1 << flags::IS_READ_ONLY) |
340
+ (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
341
+ (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
686
342
  );
687
- os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
343
+ write(os, flags_byte);
688
344
  const uint16_t seed_hash = get_seed_hash();
689
- os.write((char*)&seed_hash, sizeof(seed_hash));
345
+ write(os, seed_hash);
690
346
  if (!this->is_empty()) {
691
347
  if (!is_single_item) {
692
- const uint32_t num_keys = keys_.size();
693
- os.write((char*)&num_keys, sizeof(num_keys));
348
+ const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
349
+ write(os, num_entries);
694
350
  const uint32_t unused32 = 0;
695
- os.write((char*)&unused32, sizeof(unused32));
351
+ write(os, unused32);
696
352
  if (this->is_estimation_mode()) {
697
- os.write((char*)&(this->theta_), sizeof(uint64_t));
353
+ write(os, this->theta_);
698
354
  }
699
355
  }
700
- os.write((char*)keys_.data(), sizeof(uint64_t) * keys_.size());
356
+ write(os, entries_.data(), entries_.size() * sizeof(uint64_t));
701
357
  }
702
358
  }
703
359
 
704
360
  template<typename A>
705
- vector_u8<A> compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
706
- const bool is_single_item = keys_.size() == 1 && !this->is_estimation_mode();
361
+ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
362
+ const bool is_single_item = entries_.size() == 1 && !this->is_estimation_mode();
707
363
  const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
708
- const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + sizeof(uint64_t) * keys_.size();
709
- vector_u8<A> bytes(size);
364
+ const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs
365
+ + sizeof(uint64_t) * entries_.size();
366
+ vector_bytes bytes(size, 0, entries_.get_allocator());
710
367
  uint8_t* ptr = bytes.data() + header_size_bytes;
711
368
 
712
- ptr += copy_to_mem(&preamble_longs, ptr, sizeof(preamble_longs));
713
- const uint8_t serial_version = theta_sketch_alloc<A>::SERIAL_VERSION;
714
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
369
+ ptr += copy_to_mem(preamble_longs, ptr);
370
+ const uint8_t serial_version = SERIAL_VERSION;
371
+ ptr += copy_to_mem(serial_version, ptr);
715
372
  const uint8_t type = SKETCH_TYPE;
716
- ptr += copy_to_mem(&type, ptr, sizeof(type));
717
- const uint16_t unused16 = 0;
718
- ptr += copy_to_mem(&unused16, ptr, sizeof(unused16));
373
+ ptr += copy_to_mem(type, ptr);
374
+ ptr += sizeof(uint16_t); // unused
719
375
  const uint8_t flags_byte(
720
- (1 << theta_sketch_alloc<A>::flags::IS_COMPACT) |
721
- (1 << theta_sketch_alloc<A>::flags::IS_READ_ONLY) |
722
- (this->is_empty() ? 1 << theta_sketch_alloc<A>::flags::IS_EMPTY : 0) |
723
- (this->is_ordered() ? 1 << theta_sketch_alloc<A>::flags::IS_ORDERED : 0)
376
+ (1 << flags::IS_COMPACT) |
377
+ (1 << flags::IS_READ_ONLY) |
378
+ (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
379
+ (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
724
380
  );
725
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
381
+ ptr += copy_to_mem(flags_byte, ptr);
726
382
  const uint16_t seed_hash = get_seed_hash();
727
- ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
383
+ ptr += copy_to_mem(seed_hash, ptr);
728
384
  if (!this->is_empty()) {
729
385
  if (!is_single_item) {
730
- const uint32_t num_keys = keys_.size();
731
- ptr += copy_to_mem(&num_keys, ptr, sizeof(num_keys));
732
- const uint32_t unused32 = 0;
733
- ptr += copy_to_mem(&unused32, ptr, sizeof(unused32));
386
+ const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
387
+ ptr += copy_to_mem(num_entries, ptr);
388
+ ptr += sizeof(uint32_t);
734
389
  if (this->is_estimation_mode()) {
735
- ptr += copy_to_mem(&(this->theta_), ptr, sizeof(uint64_t));
390
+ ptr += copy_to_mem(theta_, ptr);
736
391
  }
737
392
  }
738
- ptr += copy_to_mem(keys_.data(), ptr, sizeof(uint64_t) * keys_.size());
393
+ ptr += copy_to_mem(entries_.data(), ptr, entries_.size() * sizeof(uint64_t));
739
394
  }
740
-
741
395
  return bytes;
742
396
  }
743
397
 
744
398
  template<typename A>
745
- compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed) {
746
- uint8_t preamble_longs;
747
- is.read((char*)&preamble_longs, sizeof(preamble_longs));
748
- uint8_t serial_version;
749
- is.read((char*)&serial_version, sizeof(serial_version));
750
- uint8_t type;
751
- is.read((char*)&type, sizeof(type));
752
- uint16_t unused16;
753
- is.read((char*)&unused16, sizeof(unused16));
754
- uint8_t flags_byte;
755
- is.read((char*)&flags_byte, sizeof(flags_byte));
756
- uint16_t seed_hash;
757
- is.read((char*)&seed_hash, sizeof(seed_hash));
758
- theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
759
- theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
760
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
761
- if (!is_empty) theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
762
- return internal_deserialize(is, preamble_longs, flags_byte, seed_hash);
763
- }
764
-
765
- template<typename A>
766
- compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::internal_deserialize(std::istream& is, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash) {
767
- uint64_t theta = theta_sketch_alloc<A>::MAX_THETA;
768
- uint32_t num_keys = 0;
399
+ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
400
+ const auto preamble_longs = read<uint8_t>(is);
401
+ const auto serial_version = read<uint8_t>(is);
402
+ const auto type = read<uint8_t>(is);
403
+ read<uint16_t>(is); // unused
404
+ const auto flags_byte = read<uint8_t>(is);
405
+ const auto seed_hash = read<uint16_t>(is);
406
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
407
+ checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
408
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
409
+ if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
769
410
 
770
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
411
+ uint64_t theta = theta_constants::MAX_THETA;
412
+ uint32_t num_entries = 0;
771
413
  if (!is_empty) {
772
414
  if (preamble_longs == 1) {
773
- num_keys = 1;
415
+ num_entries = 1;
774
416
  } else {
775
- is.read((char*)&num_keys, sizeof(num_keys));
776
- uint32_t unused32;
777
- is.read((char*)&unused32, sizeof(unused32));
417
+ num_entries = read<uint32_t>(is);
418
+ read<uint32_t>(is); // unused
778
419
  if (preamble_longs > 2) {
779
- is.read((char*)&theta, sizeof(theta));
420
+ theta = read<uint64_t>(is);
780
421
  }
781
422
  }
782
423
  }
783
- vector_u64<A> keys(num_keys);
784
- if (!is_empty) is.read((char*)keys.data(), sizeof(uint64_t) * keys.size());
424
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
425
+ if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
785
426
 
786
- const bool is_ordered = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_ORDERED);
787
- if (!is.good()) throw std::runtime_error("error reading from std::istream");
788
- return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash, is_ordered);
427
+ const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
428
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
429
+ return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
789
430
  }
790
431
 
791
432
  template<typename A>
792
- compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed) {
433
+ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
793
434
  ensure_minimum_memory(size, 8);
794
435
  const char* ptr = static_cast<const char*>(bytes);
436
+ const char* base = ptr;
795
437
  uint8_t preamble_longs;
796
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
438
+ ptr += copy_from_mem(ptr, preamble_longs);
797
439
  uint8_t serial_version;
798
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
440
+ ptr += copy_from_mem(ptr, serial_version);
799
441
  uint8_t type;
800
- ptr += copy_from_mem(ptr, &type, sizeof(type));
801
- uint16_t unused16;
802
- ptr += copy_from_mem(ptr, &unused16, sizeof(unused16));
442
+ ptr += copy_from_mem(ptr, type);
443
+ ptr += sizeof(uint16_t); // unused
803
444
  uint8_t flags_byte;
804
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
445
+ ptr += copy_from_mem(ptr, flags_byte);
805
446
  uint16_t seed_hash;
806
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
807
- theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
808
- theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
809
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
810
- if (!is_empty) theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
811
- return internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), preamble_longs, flags_byte, seed_hash);
812
- }
813
-
814
- template<typename A>
815
- compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::internal_deserialize(const void* bytes, size_t size, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash) {
816
- const char* ptr = static_cast<const char*>(bytes);
817
- const char* base = ptr;
818
-
819
- uint64_t theta = theta_sketch_alloc<A>::MAX_THETA;
820
- uint32_t num_keys = 0;
821
-
822
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
447
+ ptr += copy_from_mem(ptr, seed_hash);
448
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
449
+ checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
450
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
451
+ if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
452
+
453
+ uint64_t theta = theta_constants::MAX_THETA;
454
+ uint32_t num_entries = 0;
823
455
  if (!is_empty) {
824
456
  if (preamble_longs == 1) {
825
- num_keys = 1;
457
+ num_entries = 1;
826
458
  } else {
827
459
  ensure_minimum_memory(size, 8); // read the first prelong before this method
828
- ptr += copy_from_mem(ptr, &num_keys, sizeof(num_keys));
829
- uint32_t unused32;
830
- ptr += copy_from_mem(ptr, &unused32, sizeof(unused32));
460
+ ptr += copy_from_mem(ptr, num_entries);
461
+ ptr += sizeof(uint32_t); // unused
831
462
  if (preamble_longs > 2) {
832
463
  ensure_minimum_memory(size, (preamble_longs - 1) << 3);
833
- ptr += copy_from_mem(ptr, &theta, sizeof(theta));
464
+ ptr += copy_from_mem(ptr, theta);
834
465
  }
835
466
  }
836
467
  }
837
- const size_t keys_size_bytes = sizeof(uint64_t) * num_keys;
838
- check_memory_size(ptr - base + keys_size_bytes, size);
839
- vector_u64<A> keys(num_keys);
840
- if (!is_empty) ptr += copy_from_mem(ptr, keys.data(), keys_size_bytes);
841
-
842
- const bool is_ordered = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_ORDERED);
843
- return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash, is_ordered);
844
- }
845
-
846
- template<typename A>
847
- typename theta_sketch_alloc<A>::const_iterator compact_theta_sketch_alloc<A>::begin() const {
848
- return typename theta_sketch_alloc<A>::const_iterator(keys_.data(), keys_.size(), 0);
849
- }
468
+ const size_t entries_size_bytes = sizeof(uint64_t) * num_entries;
469
+ check_memory_size(ptr - base + entries_size_bytes, size);
470
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
471
+ if (!is_empty) ptr += copy_from_mem(ptr, entries.data(), entries_size_bytes);
850
472
 
851
- template<typename A>
852
- typename theta_sketch_alloc<A>::const_iterator compact_theta_sketch_alloc<A>::end() const {
853
- return typename theta_sketch_alloc<A>::const_iterator(keys_.data(), keys_.size(), keys_.size());
473
+ const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
474
+ return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
854
475
  }
855
476
 
856
- // builder
857
-
858
- template<typename A>
859
- update_theta_sketch_alloc<A>::builder::builder():
860
- lg_k_(DEFAULT_LG_K), rf_(DEFAULT_RESIZE_FACTOR), p_(1), seed_(DEFAULT_SEED) {}
477
+ // wrapped compact sketch
861
478
 
862
479
  template<typename A>
863
- typename update_theta_sketch_alloc<A>::builder& update_theta_sketch_alloc<A>::builder::set_lg_k(uint8_t lg_k) {
864
- if (lg_k < MIN_LG_K) {
865
- throw std::invalid_argument("lg_k must not be less than " + std::to_string(MIN_LG_K) + ": " + std::to_string(lg_k));
866
- }
867
- lg_k_ = lg_k;
868
- return *this;
869
- }
870
-
871
- template<typename A>
872
- typename update_theta_sketch_alloc<A>::builder& update_theta_sketch_alloc<A>::builder::set_resize_factor(resize_factor rf) {
873
- rf_ = rf;
874
- return *this;
875
- }
876
-
877
- template<typename A>
878
- typename update_theta_sketch_alloc<A>::builder& update_theta_sketch_alloc<A>::builder::set_p(float p) {
879
- p_ = p;
880
- return *this;
881
- }
480
+ wrapped_compact_theta_sketch_alloc<A>::wrapped_compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint32_t num_entries,
481
+ uint64_t theta, const uint64_t* entries):
482
+ is_empty_(is_empty),
483
+ is_ordered_(is_ordered),
484
+ seed_hash_(seed_hash),
485
+ num_entries_(num_entries),
486
+ theta_(theta),
487
+ entries_(entries)
488
+ {}
882
489
 
883
490
  template<typename A>
884
- typename update_theta_sketch_alloc<A>::builder& update_theta_sketch_alloc<A>::builder::set_seed(uint64_t seed) {
885
- seed_ = seed;
886
- return *this;
491
+ const wrapped_compact_theta_sketch_alloc<A> wrapped_compact_theta_sketch_alloc<A>::wrap(const void* bytes, size_t size, uint64_t seed, bool dump_on_error) {
492
+ auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, dump_on_error);
493
+ return wrapped_compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.num_entries, data.theta, data.entries);
887
494
  }
888
495
 
889
496
  template<typename A>
890
- uint8_t update_theta_sketch_alloc<A>::builder::starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
891
- return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
497
+ A wrapped_compact_theta_sketch_alloc<A>::get_allocator() const {
498
+ return A();
892
499
  }
893
500
 
894
501
  template<typename A>
895
- update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::builder::build() const {
896
- return update_theta_sketch_alloc<A>(starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_)), lg_k_, rf_, p_, seed_);
502
+ bool wrapped_compact_theta_sketch_alloc<A>::is_empty() const {
503
+ return is_empty_;
897
504
  }
898
505
 
899
- // iterator
900
-
901
506
  template<typename A>
902
- theta_sketch_alloc<A>::const_iterator::const_iterator(const uint64_t* keys, uint32_t size, uint32_t index):
903
- keys_(keys), size_(size), index_(index) {
904
- while (index_ < size_ && keys_[index_] == 0) ++index_;
507
+ bool wrapped_compact_theta_sketch_alloc<A>::is_ordered() const {
508
+ return is_ordered_;
905
509
  }
906
510
 
907
511
  template<typename A>
908
- typename theta_sketch_alloc<A>::const_iterator& theta_sketch_alloc<A>::const_iterator::operator++() {
909
- do {
910
- ++index_;
911
- } while (index_ < size_ && keys_[index_] == 0);
912
- return *this;
512
+ uint64_t wrapped_compact_theta_sketch_alloc<A>::get_theta64() const {
513
+ return theta_;
913
514
  }
914
515
 
915
516
  template<typename A>
916
- typename theta_sketch_alloc<A>::const_iterator theta_sketch_alloc<A>::const_iterator::operator++(int) {
917
- const_iterator tmp(*this);
918
- operator++();
919
- return tmp;
517
+ uint32_t wrapped_compact_theta_sketch_alloc<A>::get_num_retained() const {
518
+ return static_cast<uint32_t>(num_entries_);
920
519
  }
921
520
 
922
521
  template<typename A>
923
- bool theta_sketch_alloc<A>::const_iterator::operator==(const const_iterator& other) const {
924
- return index_ == other.index_;
522
+ uint16_t wrapped_compact_theta_sketch_alloc<A>::get_seed_hash() const {
523
+ return seed_hash_;
925
524
  }
926
525
 
927
526
  template<typename A>
928
- bool theta_sketch_alloc<A>::const_iterator::operator!=(const const_iterator& other) const {
929
- return index_ != other.index_;
527
+ auto wrapped_compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
528
+ return entries_;
930
529
  }
931
530
 
932
531
  template<typename A>
933
- uint64_t theta_sketch_alloc<A>::const_iterator::operator*() const {
934
- return keys_[index_];
532
+ auto wrapped_compact_theta_sketch_alloc<A>::end() const -> const_iterator {
533
+ return entries_ + num_entries_;
935
534
  }
936
535
 
937
536
  } /* namespace datasketches */