datasketches 0.2.0 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (170)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/README.md +7 -7
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/theta_wrapper.cpp +20 -4
  8. data/lib/datasketches/version.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
  10. data/vendor/datasketches-cpp/LICENSE +40 -3
  11. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  12. data/vendor/datasketches-cpp/NOTICE +1 -1
  13. data/vendor/datasketches-cpp/README.md +76 -9
  14. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  17. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  18. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  19. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  20. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  21. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  22. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  23. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  24. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
  25. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  26. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  27. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
  28. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
  29. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
  31. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  32. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  33. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  34. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  35. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  36. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  37. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  38. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  39. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  40. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
  41. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  42. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  43. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  44. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  45. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  46. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  47. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  48. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  49. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  50. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  51. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  52. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  53. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  54. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  55. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  56. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  57. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  58. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  59. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  60. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  61. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  62. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
  63. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  64. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  65. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  66. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  67. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  68. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  69. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  70. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  71. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  72. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  73. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  74. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  75. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  76. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  77. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  78. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  79. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  80. data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
  81. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  82. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  83. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  84. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  85. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
  86. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
  87. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  88. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  89. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  90. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
  91. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  92. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  93. data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
  94. data/vendor/datasketches-cpp/python/README.md +50 -50
  95. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  96. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  97. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  98. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
  99. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
  100. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  101. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  102. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  103. data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
  104. data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
  105. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  106. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  107. data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
  108. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  109. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  110. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  111. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  112. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
  113. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  114. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
  117. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
  118. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  119. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  120. data/vendor/datasketches-cpp/setup.py +10 -7
  121. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  122. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  124. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
  125. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
  126. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  127. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  128. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
  129. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  130. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  131. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
  132. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
  133. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
  134. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
  135. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
  137. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
  139. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
  140. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  141. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  142. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  144. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  145. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  146. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  147. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  148. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  149. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
  150. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
  151. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
  152. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  153. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  154. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
  155. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  157. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
  158. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
  159. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  160. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  161. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
  162. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  163. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  164. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  165. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  166. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
  167. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
  168. metadata +18 -7
  169. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  170. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -21,8 +21,8 @@ namespace datasketches {
21
21
 
22
22
  template<typename A>
23
23
  update_array_of_doubles_sketch_alloc<A>::update_array_of_doubles_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
24
- uint64_t theta, uint64_t seed, const array_of_doubles_update_policy<A>& policy, const A& allocator):
25
- Base(lg_cur_size, lg_nom_size, rf, theta, seed, policy, allocator) {}
24
+ float p, uint64_t theta, uint64_t seed, const array_of_doubles_update_policy<A>& policy, const A& allocator):
25
+ Base(lg_cur_size, lg_nom_size, rf, p, theta, seed, policy, allocator) {}
26
26
 
27
27
 
28
28
  template<typename A>
@@ -43,7 +43,7 @@ tuple_base_builder<builder, array_of_doubles_update_policy<A>, A>(policy, alloca
43
43
 
44
44
  template<typename A>
45
45
  update_array_of_doubles_sketch_alloc<A> update_array_of_doubles_sketch_alloc<A>::builder::build() const {
46
- return update_array_of_doubles_sketch_alloc<A>(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->policy_, this->allocator_);
46
+ return update_array_of_doubles_sketch_alloc<A>(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->policy_, this->allocator_);
47
47
  }
48
48
 
49
49
  // compact sketch
@@ -70,33 +70,33 @@ uint8_t compact_array_of_doubles_sketch_alloc<A>::get_num_values() const {
70
70
  template<typename A>
71
71
  void compact_array_of_doubles_sketch_alloc<A>::serialize(std::ostream& os) const {
72
72
  const uint8_t preamble_longs = 1;
73
- os.write(reinterpret_cast<const char*>(&preamble_longs), sizeof(preamble_longs));
73
+ write(os, preamble_longs);
74
74
  const uint8_t serial_version = SERIAL_VERSION;
75
- os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
75
+ write(os, serial_version);
76
76
  const uint8_t family = SKETCH_FAMILY;
77
- os.write(reinterpret_cast<const char*>(&family), sizeof(family));
77
+ write(os, family);
78
78
  const uint8_t type = SKETCH_TYPE;
79
- os.write(reinterpret_cast<const char*>(&type), sizeof(type));
79
+ write(os, type);
80
80
  const uint8_t flags_byte(
81
81
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
82
82
  (this->get_num_retained() > 0 ? 1 << flags::HAS_ENTRIES : 0) |
83
83
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
84
84
  );
85
- os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
86
- os.write(reinterpret_cast<const char*>(&num_values_), sizeof(num_values_));
85
+ write(os, flags_byte);
86
+ write(os, num_values_);
87
87
  const uint16_t seed_hash = this->get_seed_hash();
88
- os.write(reinterpret_cast<const char*>(&seed_hash), sizeof(seed_hash));
89
- os.write(reinterpret_cast<const char*>(&(this->theta_)), sizeof(uint64_t));
88
+ write(os, seed_hash);
89
+ write(os, this->theta_);
90
90
  if (this->get_num_retained() > 0) {
91
- const uint32_t num_entries = this->entries_.size();
92
- os.write(reinterpret_cast<const char*>(&num_entries), sizeof(num_entries));
91
+ const uint32_t num_entries = static_cast<uint32_t>(this->entries_.size());
92
+ write(os, num_entries);
93
93
  const uint32_t unused32 = 0;
94
- os.write(reinterpret_cast<const char*>(&unused32), sizeof(unused32));
94
+ write(os, unused32);
95
95
  for (const auto& it: this->entries_) {
96
- os.write(reinterpret_cast<const char*>(&it.first), sizeof(uint64_t));
96
+ write(os, it.first);
97
97
  }
98
98
  for (const auto& it: this->entries_) {
99
- os.write(reinterpret_cast<const char*>(it.second.data()), it.second.size() * sizeof(double));
99
+ write(os, it.second.data(), it.second.size() * sizeof(double));
100
100
  }
101
101
  }
102
102
  }
@@ -110,30 +110,29 @@ auto compact_array_of_doubles_sketch_alloc<A>::serialize(unsigned header_size_by
110
110
  vector_bytes bytes(size, 0, this->entries_.get_allocator());
111
111
  uint8_t* ptr = bytes.data() + header_size_bytes;
112
112
 
113
- ptr += copy_to_mem(&preamble_longs, ptr, sizeof(preamble_longs));
113
+ ptr += copy_to_mem(preamble_longs, ptr);
114
114
  const uint8_t serial_version = SERIAL_VERSION;
115
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
115
+ ptr += copy_to_mem(serial_version, ptr);
116
116
  const uint8_t family = SKETCH_FAMILY;
117
- ptr += copy_to_mem(&family, ptr, sizeof(family));
117
+ ptr += copy_to_mem(family, ptr);
118
118
  const uint8_t type = SKETCH_TYPE;
119
- ptr += copy_to_mem(&type, ptr, sizeof(type));
119
+ ptr += copy_to_mem(type, ptr);
120
120
  const uint8_t flags_byte(
121
121
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
122
122
  (this->get_num_retained() ? 1 << flags::HAS_ENTRIES : 0) |
123
123
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
124
124
  );
125
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
126
- ptr += copy_to_mem(&num_values_, ptr, sizeof(num_values_));
125
+ ptr += copy_to_mem(flags_byte, ptr);
126
+ ptr += copy_to_mem(num_values_, ptr);
127
127
  const uint16_t seed_hash = this->get_seed_hash();
128
- ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
129
- ptr += copy_to_mem(&(this->theta_), ptr, sizeof(uint64_t));
128
+ ptr += copy_to_mem(seed_hash, ptr);
129
+ ptr += copy_to_mem((this->theta_), ptr);
130
130
  if (this->get_num_retained() > 0) {
131
- const uint32_t num_entries = this->entries_.size();
132
- ptr += copy_to_mem(&num_entries, ptr, sizeof(num_entries));
133
- const uint32_t unused32 = 0;
134
- ptr += copy_to_mem(&unused32, ptr, sizeof(unused32));
131
+ const uint32_t num_entries = static_cast<uint32_t>(this->entries_.size());
132
+ ptr += copy_to_mem(num_entries, ptr);
133
+ ptr += sizeof(uint32_t); // unused
135
134
  for (const auto& it: this->entries_) {
136
- ptr += copy_to_mem(&it.first, ptr, sizeof(uint64_t));
135
+ ptr += copy_to_mem(it.first, ptr);
137
136
  }
138
137
  for (const auto& it: this->entries_) {
139
138
  ptr += copy_to_mem(it.second.data(), ptr, it.second.size() * sizeof(double));
@@ -144,40 +143,30 @@ auto compact_array_of_doubles_sketch_alloc<A>::serialize(unsigned header_size_by
144
143
 
145
144
  template<typename A>
146
145
  compact_array_of_doubles_sketch_alloc<A> compact_array_of_doubles_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
147
- uint8_t preamble_longs;
148
- is.read(reinterpret_cast<char*>(&preamble_longs), sizeof(preamble_longs));
149
- uint8_t serial_version;
150
- is.read(reinterpret_cast<char*>(&serial_version), sizeof(serial_version));
151
- uint8_t family;
152
- is.read(reinterpret_cast<char*>(&family), sizeof(family));
153
- uint8_t type;
154
- is.read(reinterpret_cast<char*>(&type), sizeof(type));
155
- uint8_t flags_byte;
156
- is.read(reinterpret_cast<char*>(&flags_byte), sizeof(flags_byte));
157
- uint8_t num_values;
158
- is.read(reinterpret_cast<char*>(&num_values), sizeof(num_values));
159
- uint16_t seed_hash;
160
- is.read(reinterpret_cast<char*>(&seed_hash), sizeof(seed_hash));
146
+ read<uint8_t>(is); // unused
147
+ const auto serial_version = read<uint8_t>(is);
148
+ const auto family = read<uint8_t>(is);
149
+ const auto type = read<uint8_t>(is);
150
+ const auto flags_byte = read<uint8_t>(is);
151
+ const auto num_values = read<uint8_t>(is);
152
+ const auto seed_hash = read<uint16_t>(is);
161
153
  checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
162
154
  checker<true>::check_sketch_family(family, SKETCH_FAMILY);
163
155
  checker<true>::check_sketch_type(type, SKETCH_TYPE);
164
156
  const bool has_entries = flags_byte & (1 << flags::HAS_ENTRIES);
165
157
  if (has_entries) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
166
158
 
167
- uint64_t theta;
168
- is.read(reinterpret_cast<char*>(&theta), sizeof(theta));
159
+ const auto theta = read<uint64_t>(is);
169
160
  std::vector<Entry, AllocEntry> entries(allocator);
170
161
  if (has_entries) {
171
- uint32_t num_entries;
172
- is.read(reinterpret_cast<char*>(&num_entries), sizeof(num_entries));
173
- uint32_t unused32;
174
- is.read(reinterpret_cast<char*>(&unused32), sizeof(unused32));
162
+ const auto num_entries = read<uint32_t>(is);
163
+ read<uint32_t>(is); // unused
175
164
  entries.reserve(num_entries);
176
165
  std::vector<uint64_t, AllocU64> keys(num_entries, 0, allocator);
177
- is.read(reinterpret_cast<char*>(keys.data()), num_entries * sizeof(uint64_t));
166
+ read(is, keys.data(), num_entries * sizeof(uint64_t));
178
167
  for (size_t i = 0; i < num_entries; ++i) {
179
168
  aod<A> summary(num_values, allocator);
180
- is.read(reinterpret_cast<char*>(summary.data()), num_values * sizeof(double));
169
+ read(is, summary.data(), num_values * sizeof(double));
181
170
  entries.push_back(Entry(keys[i], std::move(summary)));
182
171
  }
183
172
  }
@@ -191,20 +180,19 @@ template<typename A>
191
180
  compact_array_of_doubles_sketch_alloc<A> compact_array_of_doubles_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
192
181
  ensure_minimum_memory(size, 16);
193
182
  const char* ptr = static_cast<const char*>(bytes);
194
- uint8_t preamble_longs;
195
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
183
+ ptr += sizeof(uint8_t); // unused
196
184
  uint8_t serial_version;
197
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
185
+ ptr += copy_from_mem(ptr, serial_version);
198
186
  uint8_t family;
199
- ptr += copy_from_mem(ptr, &family, sizeof(family));
187
+ ptr += copy_from_mem(ptr, family);
200
188
  uint8_t type;
201
- ptr += copy_from_mem(ptr, &type, sizeof(type));
189
+ ptr += copy_from_mem(ptr, type);
202
190
  uint8_t flags_byte;
203
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
191
+ ptr += copy_from_mem(ptr, flags_byte);
204
192
  uint8_t num_values;
205
- ptr += copy_from_mem(ptr, &num_values, sizeof(num_values));
193
+ ptr += copy_from_mem(ptr, num_values);
206
194
  uint16_t seed_hash;
207
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
195
+ ptr += copy_from_mem(ptr, seed_hash);
208
196
  checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
209
197
  checker<true>::check_sketch_family(family, SKETCH_FAMILY);
210
198
  checker<true>::check_sketch_type(type, SKETCH_TYPE);
@@ -212,14 +200,13 @@ compact_array_of_doubles_sketch_alloc<A> compact_array_of_doubles_sketch_alloc<A
212
200
  if (has_entries) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
213
201
 
214
202
  uint64_t theta;
215
- ptr += copy_from_mem(ptr, &theta, sizeof(theta));
203
+ ptr += copy_from_mem(ptr, theta);
216
204
  std::vector<Entry, AllocEntry> entries(allocator);
217
205
  if (has_entries) {
218
206
  ensure_minimum_memory(size, 24);
219
207
  uint32_t num_entries;
220
- ptr += copy_from_mem(ptr, &num_entries, sizeof(num_entries));
221
- uint32_t unused32;
222
- ptr += copy_from_mem(ptr, &unused32, sizeof(unused32));
208
+ ptr += copy_from_mem(ptr, num_entries);
209
+ ptr += sizeof(uint32_t); // unused
223
210
  ensure_minimum_memory(size, 24 + (sizeof(uint64_t) + sizeof(double) * num_values) * num_entries);
224
211
  entries.reserve(num_entries);
225
212
  std::vector<uint64_t, AllocU64> keys(num_entries, 0, allocator);
@@ -61,7 +61,7 @@ public:
61
61
 
62
62
  private:
63
63
  // for builder
64
- array_of_doubles_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
64
+ array_of_doubles_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
65
65
  };
66
66
 
67
67
  template<typename Allocator>
@@ -20,8 +20,8 @@
20
20
  namespace datasketches {
21
21
 
22
22
  template<typename A>
23
- array_of_doubles_union_alloc<A>::array_of_doubles_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const A& allocator):
24
- Base(lg_cur_size, lg_nom_size, rf, theta, seed, policy, allocator)
23
+ array_of_doubles_union_alloc<A>::array_of_doubles_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Policy& policy, const A& allocator):
24
+ Base(lg_cur_size, lg_nom_size, rf, p, theta, seed, policy, allocator)
25
25
  {}
26
26
 
27
27
  template<typename A>
@@ -37,7 +37,7 @@ tuple_base_builder<builder, Policy, A>(policy, allocator) {}
37
37
 
38
38
  template<typename A>
39
39
  array_of_doubles_union_alloc<A> array_of_doubles_union_alloc<A>::builder::build() const {
40
- return array_of_doubles_union_alloc<A>(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->policy_, this->allocator_);
40
+ return array_of_doubles_union_alloc<A>(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->policy_, this->allocator_);
41
41
  }
42
42
 
43
43
  } /* namespace datasketches */
@@ -153,8 +153,7 @@ public:
153
153
  virtual const_iterator end() const = 0;
154
154
 
155
155
  protected:
156
- using ostrstream = std::basic_ostringstream<char, std::char_traits<char>, AllocChar<Allocator>>;
157
- virtual void print_specifics(ostrstream& os) const = 0;
156
+ virtual void print_specifics(std::ostringstream& os) const = 0;
158
157
 
159
158
  static uint16_t get_seed_hash(uint64_t seed);
160
159
 
@@ -325,6 +324,11 @@ public:
325
324
  */
326
325
  void trim();
327
326
 
327
+ /**
328
+ * Reset the sketch to the initial empty state
329
+ */
330
+ void reset();
331
+
328
332
  /**
329
333
  * Converts this sketch to a compact sketch (ordered or unordered).
330
334
  * @param ordered optional flag to specify if ordered sketch should be produced
@@ -342,10 +346,9 @@ protected:
342
346
  tuple_map map_;
343
347
 
344
348
  // for builder
345
- update_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
349
+ update_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
346
350
 
347
- using ostrstream = typename Base::ostrstream;
348
- virtual void print_specifics(ostrstream& os) const;
351
+ virtual void print_specifics(std::ostringstream& os) const;
349
352
  };
350
353
 
351
354
  // compact sketch
@@ -367,9 +370,11 @@ public:
367
370
  using vector_bytes = std::vector<uint8_t, AllocBytes>;
368
371
  using comparator = compare_by_key<ExtractKey>;
369
372
 
370
- static const uint8_t SERIAL_VERSION = 1;
373
+ static const uint8_t SERIAL_VERSION_LEGACY = 1;
374
+ static const uint8_t SERIAL_VERSION = 3;
371
375
  static const uint8_t SKETCH_FAMILY = 9;
372
- static const uint8_t SKETCH_TYPE = 5;
376
+ static const uint8_t SKETCH_TYPE = 1;
377
+ static const uint8_t SKETCH_TYPE_LEGACY = 5;
373
378
  enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
374
379
 
375
380
  // Instances of this type can be obtained:
@@ -473,8 +478,7 @@ protected:
473
478
  bool destroy_;
474
479
  };
475
480
 
476
- using ostrstream = typename Base::ostrstream;
477
- virtual void print_specifics(ostrstream& os) const;
481
+ virtual void print_specifics(std::ostringstream& os) const;
478
482
 
479
483
  };
480
484
 
@@ -53,7 +53,9 @@ double tuple_sketch<S, A>::get_upper_bound(uint8_t num_std_devs) const {
53
53
 
54
54
  template<typename S, typename A>
55
55
  string<A> tuple_sketch<S, A>::to_string(bool detail) const {
56
- ostrstream os;
56
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
57
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
58
+ std::ostringstream os;
57
59
  os << "### Tuple sketch summary:" << std::endl;
58
60
  os << " num retained entries : " << get_num_retained() << std::endl;
59
61
  os << " seed hash : " << get_seed_hash() << std::endl;
@@ -74,15 +76,15 @@ string<A> tuple_sketch<S, A>::to_string(bool detail) const {
74
76
  }
75
77
  os << "### End retained entries" << std::endl;
76
78
  }
77
- return os.str();
79
+ return string<A>(os.str().c_str(), get_allocator());
78
80
  }
79
81
 
80
82
  // update sketch
81
83
 
82
84
  template<typename S, typename U, typename P, typename A>
83
- update_tuple_sketch<S, U, P, A>::update_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
85
+ update_tuple_sketch<S, U, P, A>::update_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
84
86
  policy_(policy),
85
- map_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator)
87
+ map_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator)
86
88
  {}
87
89
 
88
90
  template<typename S, typename U, typename P, typename A>
@@ -97,12 +99,12 @@ bool update_tuple_sketch<S, U, P, A>::is_empty() const {
97
99
 
98
100
  template<typename S, typename U, typename P, typename A>
99
101
  bool update_tuple_sketch<S, U, P, A>::is_ordered() const {
100
- return false;
102
+ return map_.num_entries_ > 1 ? false : true;;
101
103
  }
102
104
 
103
105
  template<typename S, typename U, typename P, typename A>
104
106
  uint64_t update_tuple_sketch<S, U, P, A>::get_theta64() const {
105
- return map_.theta_;
107
+ return is_empty() ? theta_constants::MAX_THETA : map_.theta_;
106
108
  }
107
109
 
108
110
  template<typename S, typename U, typename P, typename A>
@@ -212,6 +214,11 @@ void update_tuple_sketch<S, U, P, A>::trim() {
212
214
  map_.trim();
213
215
  }
214
216
 
217
+ template<typename S, typename U, typename P, typename A>
218
+ void update_tuple_sketch<S, U, P, A>::reset() {
219
+ map_.reset();
220
+ }
221
+
215
222
  template<typename S, typename U, typename P, typename A>
216
223
  auto update_tuple_sketch<S, U, P, A>::begin() -> iterator {
217
224
  return iterator(map_.entries_, 1 << map_.lg_cur_size_, 0);
@@ -238,7 +245,7 @@ compact_tuple_sketch<S, A> update_tuple_sketch<S, U, P, A>::compact(bool ordered
238
245
  }
239
246
 
240
247
  template<typename S, typename U, typename P, typename A>
241
- void update_tuple_sketch<S, U, P, A>::print_specifics(ostrstream& os) const {
248
+ void update_tuple_sketch<S, U, P, A>::print_specifics(std::ostringstream& os) const {
242
249
  os << " lg nominal size : " << (int) map_.lg_nom_size_ << std::endl;
243
250
  os << " lg current size : " << (int) map_.lg_cur_size_ << std::endl;
244
251
  os << " resize factor : " << (1 << map_.rf_) << std::endl;
@@ -250,7 +257,7 @@ template<typename S, typename A>
250
257
  compact_tuple_sketch<S, A>::compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta,
251
258
  std::vector<Entry, AllocEntry>&& entries):
252
259
  is_empty_(is_empty),
253
- is_ordered_(is_ordered),
260
+ is_ordered_(is_ordered || (entries.size() <= 1ULL)),
254
261
  seed_hash_(seed_hash),
255
262
  theta_(theta),
256
263
  entries_(std::move(entries))
@@ -315,7 +322,7 @@ uint64_t compact_tuple_sketch<S, A>::get_theta64() const {
315
322
 
316
323
  template<typename S, typename A>
317
324
  uint32_t compact_tuple_sketch<S, A>::get_num_retained() const {
318
- return entries_.size();
325
+ return static_cast<uint32_t>(entries_.size());
319
326
  }
320
327
 
321
328
  template<typename S, typename A>
@@ -347,36 +354,36 @@ template<typename SerDe>
347
354
  void compact_tuple_sketch<S, A>::serialize(std::ostream& os, const SerDe& sd) const {
348
355
  const bool is_single_item = entries_.size() == 1 && !this->is_estimation_mode();
349
356
  const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
350
- os.write(reinterpret_cast<const char*>(&preamble_longs), sizeof(preamble_longs));
357
+ write(os, preamble_longs);
351
358
  const uint8_t serial_version = SERIAL_VERSION;
352
- os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
359
+ write(os, serial_version);
353
360
  const uint8_t family = SKETCH_FAMILY;
354
- os.write(reinterpret_cast<const char*>(&family), sizeof(family));
361
+ write(os, family);
355
362
  const uint8_t type = SKETCH_TYPE;
356
- os.write(reinterpret_cast<const char*>(&type), sizeof(type));
363
+ write(os, type);
357
364
  const uint8_t unused8 = 0;
358
- os.write(reinterpret_cast<const char*>(&unused8), sizeof(unused8));
365
+ write(os, unused8);
359
366
  const uint8_t flags_byte(
360
367
  (1 << flags::IS_COMPACT) |
361
368
  (1 << flags::IS_READ_ONLY) |
362
369
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
363
370
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
364
371
  );
365
- os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
372
+ write(os, flags_byte);
366
373
  const uint16_t seed_hash = get_seed_hash();
367
- os.write(reinterpret_cast<const char*>(&seed_hash), sizeof(seed_hash));
374
+ write(os, seed_hash);
368
375
  if (!this->is_empty()) {
369
376
  if (!is_single_item) {
370
- const uint32_t num_entries = entries_.size();
371
- os.write(reinterpret_cast<const char*>(&num_entries), sizeof(num_entries));
377
+ const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
378
+ write(os, num_entries);
372
379
  const uint32_t unused32 = 0;
373
- os.write(reinterpret_cast<const char*>(&unused32), sizeof(unused32));
380
+ write(os, unused32);
374
381
  if (this->is_estimation_mode()) {
375
- os.write(reinterpret_cast<const char*>(&(this->theta_)), sizeof(uint64_t));
382
+ write(os, this->theta_);
376
383
  }
377
384
  }
378
385
  for (const auto& it: entries_) {
379
- os.write(reinterpret_cast<const char*>(&it.first), sizeof(uint64_t));
386
+ write(os, it.first);
380
387
  sd.serialize(os, &it.second, 1);
381
388
  }
382
389
  }
@@ -393,36 +400,34 @@ auto compact_tuple_sketch<S, A>::serialize(unsigned header_size_bytes, const Ser
393
400
  uint8_t* ptr = bytes.data() + header_size_bytes;
394
401
  const uint8_t* end_ptr = ptr + size;
395
402
 
396
- ptr += copy_to_mem(&preamble_longs, ptr, sizeof(preamble_longs));
403
+ ptr += copy_to_mem(preamble_longs, ptr);
397
404
  const uint8_t serial_version = SERIAL_VERSION;
398
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
405
+ ptr += copy_to_mem(serial_version, ptr);
399
406
  const uint8_t family = SKETCH_FAMILY;
400
- ptr += copy_to_mem(&family, ptr, sizeof(family));
407
+ ptr += copy_to_mem(family, ptr);
401
408
  const uint8_t type = SKETCH_TYPE;
402
- ptr += copy_to_mem(&type, ptr, sizeof(type));
403
- const uint8_t unused8 = 0;
404
- ptr += copy_to_mem(&unused8, ptr, sizeof(unused8));
409
+ ptr += copy_to_mem(type, ptr);
410
+ ptr += sizeof(uint8_t); // unused
405
411
  const uint8_t flags_byte(
406
412
  (1 << flags::IS_COMPACT) |
407
413
  (1 << flags::IS_READ_ONLY) |
408
414
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
409
415
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
410
416
  );
411
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
417
+ ptr += copy_to_mem(flags_byte, ptr);
412
418
  const uint16_t seed_hash = get_seed_hash();
413
- ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
419
+ ptr += copy_to_mem(seed_hash, ptr);
414
420
  if (!this->is_empty()) {
415
421
  if (!is_single_item) {
416
- const uint32_t num_entries = entries_.size();
417
- ptr += copy_to_mem(&num_entries, ptr, sizeof(num_entries));
418
- const uint32_t unused32 = 0;
419
- ptr += copy_to_mem(&unused32, ptr, sizeof(unused32));
422
+ const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
423
+ ptr += copy_to_mem(num_entries, ptr);
424
+ ptr += sizeof(uint32_t); // unused
420
425
  if (this->is_estimation_mode()) {
421
- ptr += copy_to_mem(&theta_, ptr, sizeof(uint64_t));
426
+ ptr += copy_to_mem(theta_, ptr);
422
427
  }
423
428
  }
424
429
  for (const auto& it: entries_) {
425
- ptr += copy_to_mem(&it.first, ptr, sizeof(uint64_t));
430
+ ptr += copy_to_mem(it.first, ptr);
426
431
  ptr += sd.serialize(ptr, end_ptr - ptr, &it.second, 1);
427
432
  }
428
433
  }
@@ -432,23 +437,22 @@ auto compact_tuple_sketch<S, A>::serialize(unsigned header_size_bytes, const Ser
432
437
  template<typename S, typename A>
433
438
  template<typename SerDe>
434
439
  compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::deserialize(std::istream& is, uint64_t seed, const SerDe& sd, const A& allocator) {
435
- uint8_t preamble_longs;
436
- is.read(reinterpret_cast<char*>(&preamble_longs), sizeof(preamble_longs));
437
- uint8_t serial_version;
438
- is.read(reinterpret_cast<char*>(&serial_version), sizeof(serial_version));
439
- uint8_t family;
440
- is.read(reinterpret_cast<char*>(&family), sizeof(family));
441
- uint8_t type;
442
- is.read(reinterpret_cast<char*>(&type), sizeof(type));
443
- uint8_t unused8;
444
- is.read(reinterpret_cast<char*>(&unused8), sizeof(unused8));
445
- uint8_t flags_byte;
446
- is.read(reinterpret_cast<char*>(&flags_byte), sizeof(flags_byte));
447
- uint16_t seed_hash;
448
- is.read(reinterpret_cast<char*>(&seed_hash), sizeof(seed_hash));
449
- checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
440
+ const auto preamble_longs = read<uint8_t>(is);
441
+ const auto serial_version = read<uint8_t>(is);
442
+ const auto family = read<uint8_t>(is);
443
+ const auto type = read<uint8_t>(is);
444
+ read<uint8_t>(is); // unused
445
+ const auto flags_byte = read<uint8_t>(is);
446
+ const auto seed_hash = read<uint16_t>(is);
447
+ if (serial_version != SERIAL_VERSION && serial_version != SERIAL_VERSION_LEGACY) {
448
+ throw std::invalid_argument("serial version mismatch: expected " + std::to_string(SERIAL_VERSION) + " or "
449
+ + std::to_string(SERIAL_VERSION_LEGACY) + ", actual " + std::to_string(serial_version));
450
+ }
450
451
  checker<true>::check_sketch_family(family, SKETCH_FAMILY);
451
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
452
+ if (type != SKETCH_TYPE && type != SKETCH_TYPE_LEGACY) {
453
+ throw std::invalid_argument("sketch type mismatch: expected " + std::to_string(SKETCH_TYPE) + " or "
454
+ + std::to_string(SKETCH_TYPE_LEGACY) + ", actual " + std::to_string(type));
455
+ }
452
456
  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
453
457
  if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
454
458
 
@@ -458,11 +462,10 @@ compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::deserialize(std::istream&
458
462
  if (preamble_longs == 1) {
459
463
  num_entries = 1;
460
464
  } else {
461
- is.read(reinterpret_cast<char*>(&num_entries), sizeof(num_entries));
462
- uint32_t unused32;
463
- is.read(reinterpret_cast<char*>(&unused32), sizeof(unused32));
465
+ num_entries = read<uint32_t>(is);
466
+ read<uint32_t>(is); // unused
464
467
  if (preamble_longs > 2) {
465
- is.read(reinterpret_cast<char*>(&theta), sizeof(theta));
468
+ theta = read<uint64_t>(is);
466
469
  }
467
470
  }
468
471
  }
@@ -472,8 +475,7 @@ compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::deserialize(std::istream&
472
475
  entries.reserve(num_entries);
473
476
  std::unique_ptr<S, deleter_of_summaries> summary(alloc.allocate(1), deleter_of_summaries(1, false, allocator));
474
477
  for (size_t i = 0; i < num_entries; ++i) {
475
- uint64_t key;
476
- is.read(reinterpret_cast<char*>(&key), sizeof(uint64_t));
478
+ const auto key = read<uint64_t>(is);
477
479
  sd.deserialize(is, summary.get(), 1);
478
480
  entries.push_back(Entry(key, std::move(*summary)));
479
481
  (*summary).~S();
@@ -491,22 +493,27 @@ compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::deserialize(const void* b
491
493
  const char* ptr = static_cast<const char*>(bytes);
492
494
  const char* base = ptr;
493
495
  uint8_t preamble_longs;
494
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
496
+ ptr += copy_from_mem(ptr, preamble_longs);
495
497
  uint8_t serial_version;
496
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
498
+ ptr += copy_from_mem(ptr, serial_version);
497
499
  uint8_t family;
498
- ptr += copy_from_mem(ptr, &family, sizeof(family));
500
+ ptr += copy_from_mem(ptr, family);
499
501
  uint8_t type;
500
- ptr += copy_from_mem(ptr, &type, sizeof(type));
501
- uint8_t unused8;
502
- ptr += copy_from_mem(ptr, &unused8, sizeof(unused8));
502
+ ptr += copy_from_mem(ptr, type);
503
+ ptr += sizeof(uint8_t); // unused
503
504
  uint8_t flags_byte;
504
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
505
+ ptr += copy_from_mem(ptr, flags_byte);
505
506
  uint16_t seed_hash;
506
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
507
- checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
507
+ ptr += copy_from_mem(ptr, seed_hash);
508
+ if (serial_version != SERIAL_VERSION && serial_version != SERIAL_VERSION_LEGACY) {
509
+ throw std::invalid_argument("serial version mismatch: expected " + std::to_string(SERIAL_VERSION) + " or "
510
+ + std::to_string(SERIAL_VERSION_LEGACY) + ", actual " + std::to_string(serial_version));
511
+ }
508
512
  checker<true>::check_sketch_family(family, SKETCH_FAMILY);
509
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
513
+ if (type != SKETCH_TYPE && type != SKETCH_TYPE_LEGACY) {
514
+ throw std::invalid_argument("sketch type mismatch: expected " + std::to_string(SKETCH_TYPE) + " or "
515
+ + std::to_string(SKETCH_TYPE_LEGACY) + ", actual " + std::to_string(type));
516
+ }
510
517
  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
511
518
  if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
512
519
 
@@ -518,12 +525,11 @@ compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::deserialize(const void* b
518
525
  num_entries = 1;
519
526
  } else {
520
527
  ensure_minimum_memory(size, 8); // read the first prelong before this method
521
- ptr += copy_from_mem(ptr, &num_entries, sizeof(num_entries));
522
- uint32_t unused32;
523
- ptr += copy_from_mem(ptr, &unused32, sizeof(unused32));
528
+ ptr += copy_from_mem(ptr, num_entries);
529
+ ptr += sizeof(uint32_t); // unused
524
530
  if (preamble_longs > 2) {
525
531
  ensure_minimum_memory(size, (preamble_longs - 1) << 3);
526
- ptr += copy_from_mem(ptr, &theta, sizeof(theta));
532
+ ptr += copy_from_mem(ptr, theta);
527
533
  }
528
534
  }
529
535
  }
@@ -536,7 +542,7 @@ compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::deserialize(const void* b
536
542
  std::unique_ptr<S, deleter_of_summaries> summary(alloc.allocate(1), deleter_of_summaries(1, false, allocator));
537
543
  for (size_t i = 0; i < num_entries; ++i) {
538
544
  uint64_t key;
539
- ptr += copy_from_mem(ptr, &key, sizeof(key));
545
+ ptr += copy_from_mem(ptr, key);
540
546
  ptr += sd.deserialize(ptr, base + size - ptr, summary.get(), 1);
541
547
  entries.push_back(Entry(key, std::move(*summary)));
542
548
  (*summary).~S();
@@ -548,26 +554,26 @@ compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::deserialize(const void* b
548
554
 
549
555
  template<typename S, typename A>
550
556
  auto compact_tuple_sketch<S, A>::begin() -> iterator {
551
- return iterator(entries_.data(), entries_.size(), 0);
557
+ return iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);
552
558
  }
553
559
 
554
560
  template<typename S, typename A>
555
561
  auto compact_tuple_sketch<S, A>::end() -> iterator {
556
- return iterator(nullptr, 0, entries_.size());
562
+ return iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));
557
563
  }
558
564
 
559
565
  template<typename S, typename A>
560
566
  auto compact_tuple_sketch<S, A>::begin() const -> const_iterator {
561
- return const_iterator(entries_.data(), entries_.size(), 0);
567
+ return const_iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);
562
568
  }
563
569
 
564
570
  template<typename S, typename A>
565
571
  auto compact_tuple_sketch<S, A>::end() const -> const_iterator {
566
- return const_iterator(nullptr, 0, entries_.size());
572
+ return const_iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));
567
573
  }
568
574
 
569
575
  template<typename S, typename A>
570
- void compact_tuple_sketch<S, A>::print_specifics(ostrstream&) const {}
576
+ void compact_tuple_sketch<S, A>::print_specifics(std::ostringstream&) const {}
571
577
 
572
578
  // builder
573
579
 
@@ -581,7 +587,7 @@ tuple_base_builder<builder, P, A>(policy, allocator) {}
581
587
 
582
588
  template<typename S, typename U, typename P, typename A>
583
589
  auto update_tuple_sketch<S, U, P, A>::builder::build() const -> update_tuple_sketch {
584
- return update_tuple_sketch(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->policy_, this->allocator_);
590
+ return update_tuple_sketch(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->policy_, this->allocator_);
585
591
  }
586
592
 
587
593
  } /* namespace datasketches */