datasketches 0.2.0 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/README.md +7 -7
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/theta_wrapper.cpp +20 -4
  8. data/lib/datasketches/version.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
  10. data/vendor/datasketches-cpp/LICENSE +40 -3
  11. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  12. data/vendor/datasketches-cpp/NOTICE +1 -1
  13. data/vendor/datasketches-cpp/README.md +76 -9
  14. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  17. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  18. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  19. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  20. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  21. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  22. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  23. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  24. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
  25. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  26. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  27. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
  28. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
  29. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
  31. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  32. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  33. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  34. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  35. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  36. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  37. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  38. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  39. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  40. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
  41. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  42. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  43. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  44. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  45. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  46. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  47. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  48. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  49. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  50. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  51. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  52. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  53. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  54. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  55. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  56. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  57. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  58. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  59. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  60. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  61. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  62. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
  63. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  64. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  65. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  66. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  67. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  68. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  69. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  70. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  71. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  72. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  73. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  74. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  75. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  76. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  77. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  78. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  79. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  80. data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
  81. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  82. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  83. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  84. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  85. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
  86. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
  87. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  88. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  89. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  90. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
  91. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  92. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  93. data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
  94. data/vendor/datasketches-cpp/python/README.md +50 -50
  95. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  96. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  97. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  98. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
  99. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
  100. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  101. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  102. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  103. data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
  104. data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
  105. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  106. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  107. data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
  108. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  109. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  110. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  111. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  112. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
  113. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  114. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
  117. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
  118. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  119. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  120. data/vendor/datasketches-cpp/setup.py +10 -7
  121. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  122. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  124. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
  125. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
  126. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  127. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  128. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
  129. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  130. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  131. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
  132. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
  133. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
  134. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
  135. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
  137. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
  139. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
  140. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  141. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  142. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  144. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  145. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  146. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  147. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  148. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  149. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
  150. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
  151. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
  152. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  153. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  154. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
  155. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  157. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
  158. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
  159. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  160. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  161. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
  162. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  163. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  164. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  165. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  166. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
  167. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
  168. metadata +18 -7
  169. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  170. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -21,8 +21,8 @@ namespace datasketches {
21
21
 
22
22
  template<typename A>
23
23
  update_array_of_doubles_sketch_alloc<A>::update_array_of_doubles_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
24
- uint64_t theta, uint64_t seed, const array_of_doubles_update_policy<A>& policy, const A& allocator):
25
- Base(lg_cur_size, lg_nom_size, rf, theta, seed, policy, allocator) {}
24
+ float p, uint64_t theta, uint64_t seed, const array_of_doubles_update_policy<A>& policy, const A& allocator):
25
+ Base(lg_cur_size, lg_nom_size, rf, p, theta, seed, policy, allocator) {}
26
26
 
27
27
 
28
28
  template<typename A>
@@ -43,7 +43,7 @@ tuple_base_builder<builder, array_of_doubles_update_policy<A>, A>(policy, alloca
43
43
 
44
44
  template<typename A>
45
45
  update_array_of_doubles_sketch_alloc<A> update_array_of_doubles_sketch_alloc<A>::builder::build() const {
46
- return update_array_of_doubles_sketch_alloc<A>(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->policy_, this->allocator_);
46
+ return update_array_of_doubles_sketch_alloc<A>(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->policy_, this->allocator_);
47
47
  }
48
48
 
49
49
  // compact sketch
@@ -70,33 +70,33 @@ uint8_t compact_array_of_doubles_sketch_alloc<A>::get_num_values() const {
70
70
  template<typename A>
71
71
  void compact_array_of_doubles_sketch_alloc<A>::serialize(std::ostream& os) const {
72
72
  const uint8_t preamble_longs = 1;
73
- os.write(reinterpret_cast<const char*>(&preamble_longs), sizeof(preamble_longs));
73
+ write(os, preamble_longs);
74
74
  const uint8_t serial_version = SERIAL_VERSION;
75
- os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
75
+ write(os, serial_version);
76
76
  const uint8_t family = SKETCH_FAMILY;
77
- os.write(reinterpret_cast<const char*>(&family), sizeof(family));
77
+ write(os, family);
78
78
  const uint8_t type = SKETCH_TYPE;
79
- os.write(reinterpret_cast<const char*>(&type), sizeof(type));
79
+ write(os, type);
80
80
  const uint8_t flags_byte(
81
81
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
82
82
  (this->get_num_retained() > 0 ? 1 << flags::HAS_ENTRIES : 0) |
83
83
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
84
84
  );
85
- os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
86
- os.write(reinterpret_cast<const char*>(&num_values_), sizeof(num_values_));
85
+ write(os, flags_byte);
86
+ write(os, num_values_);
87
87
  const uint16_t seed_hash = this->get_seed_hash();
88
- os.write(reinterpret_cast<const char*>(&seed_hash), sizeof(seed_hash));
89
- os.write(reinterpret_cast<const char*>(&(this->theta_)), sizeof(uint64_t));
88
+ write(os, seed_hash);
89
+ write(os, this->theta_);
90
90
  if (this->get_num_retained() > 0) {
91
- const uint32_t num_entries = this->entries_.size();
92
- os.write(reinterpret_cast<const char*>(&num_entries), sizeof(num_entries));
91
+ const uint32_t num_entries = static_cast<uint32_t>(this->entries_.size());
92
+ write(os, num_entries);
93
93
  const uint32_t unused32 = 0;
94
- os.write(reinterpret_cast<const char*>(&unused32), sizeof(unused32));
94
+ write(os, unused32);
95
95
  for (const auto& it: this->entries_) {
96
- os.write(reinterpret_cast<const char*>(&it.first), sizeof(uint64_t));
96
+ write(os, it.first);
97
97
  }
98
98
  for (const auto& it: this->entries_) {
99
- os.write(reinterpret_cast<const char*>(it.second.data()), it.second.size() * sizeof(double));
99
+ write(os, it.second.data(), it.second.size() * sizeof(double));
100
100
  }
101
101
  }
102
102
  }
@@ -110,30 +110,29 @@ auto compact_array_of_doubles_sketch_alloc<A>::serialize(unsigned header_size_by
110
110
  vector_bytes bytes(size, 0, this->entries_.get_allocator());
111
111
  uint8_t* ptr = bytes.data() + header_size_bytes;
112
112
 
113
- ptr += copy_to_mem(&preamble_longs, ptr, sizeof(preamble_longs));
113
+ ptr += copy_to_mem(preamble_longs, ptr);
114
114
  const uint8_t serial_version = SERIAL_VERSION;
115
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
115
+ ptr += copy_to_mem(serial_version, ptr);
116
116
  const uint8_t family = SKETCH_FAMILY;
117
- ptr += copy_to_mem(&family, ptr, sizeof(family));
117
+ ptr += copy_to_mem(family, ptr);
118
118
  const uint8_t type = SKETCH_TYPE;
119
- ptr += copy_to_mem(&type, ptr, sizeof(type));
119
+ ptr += copy_to_mem(type, ptr);
120
120
  const uint8_t flags_byte(
121
121
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
122
122
  (this->get_num_retained() ? 1 << flags::HAS_ENTRIES : 0) |
123
123
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
124
124
  );
125
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
126
- ptr += copy_to_mem(&num_values_, ptr, sizeof(num_values_));
125
+ ptr += copy_to_mem(flags_byte, ptr);
126
+ ptr += copy_to_mem(num_values_, ptr);
127
127
  const uint16_t seed_hash = this->get_seed_hash();
128
- ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
129
- ptr += copy_to_mem(&(this->theta_), ptr, sizeof(uint64_t));
128
+ ptr += copy_to_mem(seed_hash, ptr);
129
+ ptr += copy_to_mem((this->theta_), ptr);
130
130
  if (this->get_num_retained() > 0) {
131
- const uint32_t num_entries = this->entries_.size();
132
- ptr += copy_to_mem(&num_entries, ptr, sizeof(num_entries));
133
- const uint32_t unused32 = 0;
134
- ptr += copy_to_mem(&unused32, ptr, sizeof(unused32));
131
+ const uint32_t num_entries = static_cast<uint32_t>(this->entries_.size());
132
+ ptr += copy_to_mem(num_entries, ptr);
133
+ ptr += sizeof(uint32_t); // unused
135
134
  for (const auto& it: this->entries_) {
136
- ptr += copy_to_mem(&it.first, ptr, sizeof(uint64_t));
135
+ ptr += copy_to_mem(it.first, ptr);
137
136
  }
138
137
  for (const auto& it: this->entries_) {
139
138
  ptr += copy_to_mem(it.second.data(), ptr, it.second.size() * sizeof(double));
@@ -144,40 +143,30 @@ auto compact_array_of_doubles_sketch_alloc<A>::serialize(unsigned header_size_by
144
143
 
145
144
  template<typename A>
146
145
  compact_array_of_doubles_sketch_alloc<A> compact_array_of_doubles_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
147
- uint8_t preamble_longs;
148
- is.read(reinterpret_cast<char*>(&preamble_longs), sizeof(preamble_longs));
149
- uint8_t serial_version;
150
- is.read(reinterpret_cast<char*>(&serial_version), sizeof(serial_version));
151
- uint8_t family;
152
- is.read(reinterpret_cast<char*>(&family), sizeof(family));
153
- uint8_t type;
154
- is.read(reinterpret_cast<char*>(&type), sizeof(type));
155
- uint8_t flags_byte;
156
- is.read(reinterpret_cast<char*>(&flags_byte), sizeof(flags_byte));
157
- uint8_t num_values;
158
- is.read(reinterpret_cast<char*>(&num_values), sizeof(num_values));
159
- uint16_t seed_hash;
160
- is.read(reinterpret_cast<char*>(&seed_hash), sizeof(seed_hash));
146
+ read<uint8_t>(is); // unused
147
+ const auto serial_version = read<uint8_t>(is);
148
+ const auto family = read<uint8_t>(is);
149
+ const auto type = read<uint8_t>(is);
150
+ const auto flags_byte = read<uint8_t>(is);
151
+ const auto num_values = read<uint8_t>(is);
152
+ const auto seed_hash = read<uint16_t>(is);
161
153
  checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
162
154
  checker<true>::check_sketch_family(family, SKETCH_FAMILY);
163
155
  checker<true>::check_sketch_type(type, SKETCH_TYPE);
164
156
  const bool has_entries = flags_byte & (1 << flags::HAS_ENTRIES);
165
157
  if (has_entries) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
166
158
 
167
- uint64_t theta;
168
- is.read(reinterpret_cast<char*>(&theta), sizeof(theta));
159
+ const auto theta = read<uint64_t>(is);
169
160
  std::vector<Entry, AllocEntry> entries(allocator);
170
161
  if (has_entries) {
171
- uint32_t num_entries;
172
- is.read(reinterpret_cast<char*>(&num_entries), sizeof(num_entries));
173
- uint32_t unused32;
174
- is.read(reinterpret_cast<char*>(&unused32), sizeof(unused32));
162
+ const auto num_entries = read<uint32_t>(is);
163
+ read<uint32_t>(is); // unused
175
164
  entries.reserve(num_entries);
176
165
  std::vector<uint64_t, AllocU64> keys(num_entries, 0, allocator);
177
- is.read(reinterpret_cast<char*>(keys.data()), num_entries * sizeof(uint64_t));
166
+ read(is, keys.data(), num_entries * sizeof(uint64_t));
178
167
  for (size_t i = 0; i < num_entries; ++i) {
179
168
  aod<A> summary(num_values, allocator);
180
- is.read(reinterpret_cast<char*>(summary.data()), num_values * sizeof(double));
169
+ read(is, summary.data(), num_values * sizeof(double));
181
170
  entries.push_back(Entry(keys[i], std::move(summary)));
182
171
  }
183
172
  }
@@ -191,20 +180,19 @@ template<typename A>
191
180
  compact_array_of_doubles_sketch_alloc<A> compact_array_of_doubles_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
192
181
  ensure_minimum_memory(size, 16);
193
182
  const char* ptr = static_cast<const char*>(bytes);
194
- uint8_t preamble_longs;
195
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
183
+ ptr += sizeof(uint8_t); // unused
196
184
  uint8_t serial_version;
197
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
185
+ ptr += copy_from_mem(ptr, serial_version);
198
186
  uint8_t family;
199
- ptr += copy_from_mem(ptr, &family, sizeof(family));
187
+ ptr += copy_from_mem(ptr, family);
200
188
  uint8_t type;
201
- ptr += copy_from_mem(ptr, &type, sizeof(type));
189
+ ptr += copy_from_mem(ptr, type);
202
190
  uint8_t flags_byte;
203
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
191
+ ptr += copy_from_mem(ptr, flags_byte);
204
192
  uint8_t num_values;
205
- ptr += copy_from_mem(ptr, &num_values, sizeof(num_values));
193
+ ptr += copy_from_mem(ptr, num_values);
206
194
  uint16_t seed_hash;
207
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
195
+ ptr += copy_from_mem(ptr, seed_hash);
208
196
  checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
209
197
  checker<true>::check_sketch_family(family, SKETCH_FAMILY);
210
198
  checker<true>::check_sketch_type(type, SKETCH_TYPE);
@@ -212,14 +200,13 @@ compact_array_of_doubles_sketch_alloc<A> compact_array_of_doubles_sketch_alloc<A
212
200
  if (has_entries) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
213
201
 
214
202
  uint64_t theta;
215
- ptr += copy_from_mem(ptr, &theta, sizeof(theta));
203
+ ptr += copy_from_mem(ptr, theta);
216
204
  std::vector<Entry, AllocEntry> entries(allocator);
217
205
  if (has_entries) {
218
206
  ensure_minimum_memory(size, 24);
219
207
  uint32_t num_entries;
220
- ptr += copy_from_mem(ptr, &num_entries, sizeof(num_entries));
221
- uint32_t unused32;
222
- ptr += copy_from_mem(ptr, &unused32, sizeof(unused32));
208
+ ptr += copy_from_mem(ptr, num_entries);
209
+ ptr += sizeof(uint32_t); // unused
223
210
  ensure_minimum_memory(size, 24 + (sizeof(uint64_t) + sizeof(double) * num_values) * num_entries);
224
211
  entries.reserve(num_entries);
225
212
  std::vector<uint64_t, AllocU64> keys(num_entries, 0, allocator);
@@ -61,7 +61,7 @@ public:
61
61
 
62
62
  private:
63
63
  // for builder
64
- array_of_doubles_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
64
+ array_of_doubles_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
65
65
  };
66
66
 
67
67
  template<typename Allocator>
@@ -20,8 +20,8 @@
20
20
  namespace datasketches {
21
21
 
22
22
  template<typename A>
23
- array_of_doubles_union_alloc<A>::array_of_doubles_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const A& allocator):
24
- Base(lg_cur_size, lg_nom_size, rf, theta, seed, policy, allocator)
23
+ array_of_doubles_union_alloc<A>::array_of_doubles_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Policy& policy, const A& allocator):
24
+ Base(lg_cur_size, lg_nom_size, rf, p, theta, seed, policy, allocator)
25
25
  {}
26
26
 
27
27
  template<typename A>
@@ -37,7 +37,7 @@ tuple_base_builder<builder, Policy, A>(policy, allocator) {}
37
37
 
38
38
  template<typename A>
39
39
  array_of_doubles_union_alloc<A> array_of_doubles_union_alloc<A>::builder::build() const {
40
- return array_of_doubles_union_alloc<A>(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->policy_, this->allocator_);
40
+ return array_of_doubles_union_alloc<A>(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->policy_, this->allocator_);
41
41
  }
42
42
 
43
43
  } /* namespace datasketches */
@@ -153,8 +153,7 @@ public:
153
153
  virtual const_iterator end() const = 0;
154
154
 
155
155
  protected:
156
- using ostrstream = std::basic_ostringstream<char, std::char_traits<char>, AllocChar<Allocator>>;
157
- virtual void print_specifics(ostrstream& os) const = 0;
156
+ virtual void print_specifics(std::ostringstream& os) const = 0;
158
157
 
159
158
  static uint16_t get_seed_hash(uint64_t seed);
160
159
 
@@ -325,6 +324,11 @@ public:
325
324
  */
326
325
  void trim();
327
326
 
327
+ /**
328
+ * Reset the sketch to the initial empty state
329
+ */
330
+ void reset();
331
+
328
332
  /**
329
333
  * Converts this sketch to a compact sketch (ordered or unordered).
330
334
  * @param ordered optional flag to specify if ordered sketch should be produced
@@ -342,10 +346,9 @@ protected:
342
346
  tuple_map map_;
343
347
 
344
348
  // for builder
345
- update_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
349
+ update_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
346
350
 
347
- using ostrstream = typename Base::ostrstream;
348
- virtual void print_specifics(ostrstream& os) const;
351
+ virtual void print_specifics(std::ostringstream& os) const;
349
352
  };
350
353
 
351
354
  // compact sketch
@@ -367,9 +370,11 @@ public:
367
370
  using vector_bytes = std::vector<uint8_t, AllocBytes>;
368
371
  using comparator = compare_by_key<ExtractKey>;
369
372
 
370
- static const uint8_t SERIAL_VERSION = 1;
373
+ static const uint8_t SERIAL_VERSION_LEGACY = 1;
374
+ static const uint8_t SERIAL_VERSION = 3;
371
375
  static const uint8_t SKETCH_FAMILY = 9;
372
- static const uint8_t SKETCH_TYPE = 5;
376
+ static const uint8_t SKETCH_TYPE = 1;
377
+ static const uint8_t SKETCH_TYPE_LEGACY = 5;
373
378
  enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
374
379
 
375
380
  // Instances of this type can be obtained:
@@ -473,8 +478,7 @@ protected:
473
478
  bool destroy_;
474
479
  };
475
480
 
476
- using ostrstream = typename Base::ostrstream;
477
- virtual void print_specifics(ostrstream& os) const;
481
+ virtual void print_specifics(std::ostringstream& os) const;
478
482
 
479
483
  };
480
484
 
@@ -53,7 +53,9 @@ double tuple_sketch<S, A>::get_upper_bound(uint8_t num_std_devs) const {
53
53
 
54
54
  template<typename S, typename A>
55
55
  string<A> tuple_sketch<S, A>::to_string(bool detail) const {
56
- ostrstream os;
56
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
57
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
58
+ std::ostringstream os;
57
59
  os << "### Tuple sketch summary:" << std::endl;
58
60
  os << " num retained entries : " << get_num_retained() << std::endl;
59
61
  os << " seed hash : " << get_seed_hash() << std::endl;
@@ -74,15 +76,15 @@ string<A> tuple_sketch<S, A>::to_string(bool detail) const {
74
76
  }
75
77
  os << "### End retained entries" << std::endl;
76
78
  }
77
- return os.str();
79
+ return string<A>(os.str().c_str(), get_allocator());
78
80
  }
79
81
 
80
82
  // update sketch
81
83
 
82
84
  template<typename S, typename U, typename P, typename A>
83
- update_tuple_sketch<S, U, P, A>::update_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
85
+ update_tuple_sketch<S, U, P, A>::update_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
84
86
  policy_(policy),
85
- map_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator)
87
+ map_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator)
86
88
  {}
87
89
 
88
90
  template<typename S, typename U, typename P, typename A>
@@ -97,12 +99,12 @@ bool update_tuple_sketch<S, U, P, A>::is_empty() const {
97
99
 
98
100
  template<typename S, typename U, typename P, typename A>
99
101
  bool update_tuple_sketch<S, U, P, A>::is_ordered() const {
100
- return false;
102
+ return map_.num_entries_ > 1 ? false : true;;
101
103
  }
102
104
 
103
105
  template<typename S, typename U, typename P, typename A>
104
106
  uint64_t update_tuple_sketch<S, U, P, A>::get_theta64() const {
105
- return map_.theta_;
107
+ return is_empty() ? theta_constants::MAX_THETA : map_.theta_;
106
108
  }
107
109
 
108
110
  template<typename S, typename U, typename P, typename A>
@@ -212,6 +214,11 @@ void update_tuple_sketch<S, U, P, A>::trim() {
212
214
  map_.trim();
213
215
  }
214
216
 
217
+ template<typename S, typename U, typename P, typename A>
218
+ void update_tuple_sketch<S, U, P, A>::reset() {
219
+ map_.reset();
220
+ }
221
+
215
222
  template<typename S, typename U, typename P, typename A>
216
223
  auto update_tuple_sketch<S, U, P, A>::begin() -> iterator {
217
224
  return iterator(map_.entries_, 1 << map_.lg_cur_size_, 0);
@@ -238,7 +245,7 @@ compact_tuple_sketch<S, A> update_tuple_sketch<S, U, P, A>::compact(bool ordered
238
245
  }
239
246
 
240
247
  template<typename S, typename U, typename P, typename A>
241
- void update_tuple_sketch<S, U, P, A>::print_specifics(ostrstream& os) const {
248
+ void update_tuple_sketch<S, U, P, A>::print_specifics(std::ostringstream& os) const {
242
249
  os << " lg nominal size : " << (int) map_.lg_nom_size_ << std::endl;
243
250
  os << " lg current size : " << (int) map_.lg_cur_size_ << std::endl;
244
251
  os << " resize factor : " << (1 << map_.rf_) << std::endl;
@@ -250,7 +257,7 @@ template<typename S, typename A>
250
257
  compact_tuple_sketch<S, A>::compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta,
251
258
  std::vector<Entry, AllocEntry>&& entries):
252
259
  is_empty_(is_empty),
253
- is_ordered_(is_ordered),
260
+ is_ordered_(is_ordered || (entries.size() <= 1ULL)),
254
261
  seed_hash_(seed_hash),
255
262
  theta_(theta),
256
263
  entries_(std::move(entries))
@@ -315,7 +322,7 @@ uint64_t compact_tuple_sketch<S, A>::get_theta64() const {
315
322
 
316
323
  template<typename S, typename A>
317
324
  uint32_t compact_tuple_sketch<S, A>::get_num_retained() const {
318
- return entries_.size();
325
+ return static_cast<uint32_t>(entries_.size());
319
326
  }
320
327
 
321
328
  template<typename S, typename A>
@@ -347,36 +354,36 @@ template<typename SerDe>
347
354
  void compact_tuple_sketch<S, A>::serialize(std::ostream& os, const SerDe& sd) const {
348
355
  const bool is_single_item = entries_.size() == 1 && !this->is_estimation_mode();
349
356
  const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
350
- os.write(reinterpret_cast<const char*>(&preamble_longs), sizeof(preamble_longs));
357
+ write(os, preamble_longs);
351
358
  const uint8_t serial_version = SERIAL_VERSION;
352
- os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
359
+ write(os, serial_version);
353
360
  const uint8_t family = SKETCH_FAMILY;
354
- os.write(reinterpret_cast<const char*>(&family), sizeof(family));
361
+ write(os, family);
355
362
  const uint8_t type = SKETCH_TYPE;
356
- os.write(reinterpret_cast<const char*>(&type), sizeof(type));
363
+ write(os, type);
357
364
  const uint8_t unused8 = 0;
358
- os.write(reinterpret_cast<const char*>(&unused8), sizeof(unused8));
365
+ write(os, unused8);
359
366
  const uint8_t flags_byte(
360
367
  (1 << flags::IS_COMPACT) |
361
368
  (1 << flags::IS_READ_ONLY) |
362
369
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
363
370
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
364
371
  );
365
- os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
372
+ write(os, flags_byte);
366
373
  const uint16_t seed_hash = get_seed_hash();
367
- os.write(reinterpret_cast<const char*>(&seed_hash), sizeof(seed_hash));
374
+ write(os, seed_hash);
368
375
  if (!this->is_empty()) {
369
376
  if (!is_single_item) {
370
- const uint32_t num_entries = entries_.size();
371
- os.write(reinterpret_cast<const char*>(&num_entries), sizeof(num_entries));
377
+ const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
378
+ write(os, num_entries);
372
379
  const uint32_t unused32 = 0;
373
- os.write(reinterpret_cast<const char*>(&unused32), sizeof(unused32));
380
+ write(os, unused32);
374
381
  if (this->is_estimation_mode()) {
375
- os.write(reinterpret_cast<const char*>(&(this->theta_)), sizeof(uint64_t));
382
+ write(os, this->theta_);
376
383
  }
377
384
  }
378
385
  for (const auto& it: entries_) {
379
- os.write(reinterpret_cast<const char*>(&it.first), sizeof(uint64_t));
386
+ write(os, it.first);
380
387
  sd.serialize(os, &it.second, 1);
381
388
  }
382
389
  }
@@ -393,36 +400,34 @@ auto compact_tuple_sketch<S, A>::serialize(unsigned header_size_bytes, const Ser
393
400
  uint8_t* ptr = bytes.data() + header_size_bytes;
394
401
  const uint8_t* end_ptr = ptr + size;
395
402
 
396
- ptr += copy_to_mem(&preamble_longs, ptr, sizeof(preamble_longs));
403
+ ptr += copy_to_mem(preamble_longs, ptr);
397
404
  const uint8_t serial_version = SERIAL_VERSION;
398
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
405
+ ptr += copy_to_mem(serial_version, ptr);
399
406
  const uint8_t family = SKETCH_FAMILY;
400
- ptr += copy_to_mem(&family, ptr, sizeof(family));
407
+ ptr += copy_to_mem(family, ptr);
401
408
  const uint8_t type = SKETCH_TYPE;
402
- ptr += copy_to_mem(&type, ptr, sizeof(type));
403
- const uint8_t unused8 = 0;
404
- ptr += copy_to_mem(&unused8, ptr, sizeof(unused8));
409
+ ptr += copy_to_mem(type, ptr);
410
+ ptr += sizeof(uint8_t); // unused
405
411
  const uint8_t flags_byte(
406
412
  (1 << flags::IS_COMPACT) |
407
413
  (1 << flags::IS_READ_ONLY) |
408
414
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
409
415
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
410
416
  );
411
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
417
+ ptr += copy_to_mem(flags_byte, ptr);
412
418
  const uint16_t seed_hash = get_seed_hash();
413
- ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
419
+ ptr += copy_to_mem(seed_hash, ptr);
414
420
  if (!this->is_empty()) {
415
421
  if (!is_single_item) {
416
- const uint32_t num_entries = entries_.size();
417
- ptr += copy_to_mem(&num_entries, ptr, sizeof(num_entries));
418
- const uint32_t unused32 = 0;
419
- ptr += copy_to_mem(&unused32, ptr, sizeof(unused32));
422
+ const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
423
+ ptr += copy_to_mem(num_entries, ptr);
424
+ ptr += sizeof(uint32_t); // unused
420
425
  if (this->is_estimation_mode()) {
421
- ptr += copy_to_mem(&theta_, ptr, sizeof(uint64_t));
426
+ ptr += copy_to_mem(theta_, ptr);
422
427
  }
423
428
  }
424
429
  for (const auto& it: entries_) {
425
- ptr += copy_to_mem(&it.first, ptr, sizeof(uint64_t));
430
+ ptr += copy_to_mem(it.first, ptr);
426
431
  ptr += sd.serialize(ptr, end_ptr - ptr, &it.second, 1);
427
432
  }
428
433
  }
@@ -432,23 +437,22 @@ auto compact_tuple_sketch<S, A>::serialize(unsigned header_size_bytes, const Ser
432
437
  template<typename S, typename A>
433
438
  template<typename SerDe>
434
439
  compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::deserialize(std::istream& is, uint64_t seed, const SerDe& sd, const A& allocator) {
435
- uint8_t preamble_longs;
436
- is.read(reinterpret_cast<char*>(&preamble_longs), sizeof(preamble_longs));
437
- uint8_t serial_version;
438
- is.read(reinterpret_cast<char*>(&serial_version), sizeof(serial_version));
439
- uint8_t family;
440
- is.read(reinterpret_cast<char*>(&family), sizeof(family));
441
- uint8_t type;
442
- is.read(reinterpret_cast<char*>(&type), sizeof(type));
443
- uint8_t unused8;
444
- is.read(reinterpret_cast<char*>(&unused8), sizeof(unused8));
445
- uint8_t flags_byte;
446
- is.read(reinterpret_cast<char*>(&flags_byte), sizeof(flags_byte));
447
- uint16_t seed_hash;
448
- is.read(reinterpret_cast<char*>(&seed_hash), sizeof(seed_hash));
449
- checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
440
+ const auto preamble_longs = read<uint8_t>(is);
441
+ const auto serial_version = read<uint8_t>(is);
442
+ const auto family = read<uint8_t>(is);
443
+ const auto type = read<uint8_t>(is);
444
+ read<uint8_t>(is); // unused
445
+ const auto flags_byte = read<uint8_t>(is);
446
+ const auto seed_hash = read<uint16_t>(is);
447
+ if (serial_version != SERIAL_VERSION && serial_version != SERIAL_VERSION_LEGACY) {
448
+ throw std::invalid_argument("serial version mismatch: expected " + std::to_string(SERIAL_VERSION) + " or "
449
+ + std::to_string(SERIAL_VERSION_LEGACY) + ", actual " + std::to_string(serial_version));
450
+ }
450
451
  checker<true>::check_sketch_family(family, SKETCH_FAMILY);
451
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
452
+ if (type != SKETCH_TYPE && type != SKETCH_TYPE_LEGACY) {
453
+ throw std::invalid_argument("sketch type mismatch: expected " + std::to_string(SKETCH_TYPE) + " or "
454
+ + std::to_string(SKETCH_TYPE_LEGACY) + ", actual " + std::to_string(type));
455
+ }
452
456
  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
453
457
  if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
454
458
 
@@ -458,11 +462,10 @@ compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::deserialize(std::istream&
458
462
  if (preamble_longs == 1) {
459
463
  num_entries = 1;
460
464
  } else {
461
- is.read(reinterpret_cast<char*>(&num_entries), sizeof(num_entries));
462
- uint32_t unused32;
463
- is.read(reinterpret_cast<char*>(&unused32), sizeof(unused32));
465
+ num_entries = read<uint32_t>(is);
466
+ read<uint32_t>(is); // unused
464
467
  if (preamble_longs > 2) {
465
- is.read(reinterpret_cast<char*>(&theta), sizeof(theta));
468
+ theta = read<uint64_t>(is);
466
469
  }
467
470
  }
468
471
  }
@@ -472,8 +475,7 @@ compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::deserialize(std::istream&
472
475
  entries.reserve(num_entries);
473
476
  std::unique_ptr<S, deleter_of_summaries> summary(alloc.allocate(1), deleter_of_summaries(1, false, allocator));
474
477
  for (size_t i = 0; i < num_entries; ++i) {
475
- uint64_t key;
476
- is.read(reinterpret_cast<char*>(&key), sizeof(uint64_t));
478
+ const auto key = read<uint64_t>(is);
477
479
  sd.deserialize(is, summary.get(), 1);
478
480
  entries.push_back(Entry(key, std::move(*summary)));
479
481
  (*summary).~S();
@@ -491,22 +493,27 @@ compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::deserialize(const void* b
491
493
  const char* ptr = static_cast<const char*>(bytes);
492
494
  const char* base = ptr;
493
495
  uint8_t preamble_longs;
494
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
496
+ ptr += copy_from_mem(ptr, preamble_longs);
495
497
  uint8_t serial_version;
496
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
498
+ ptr += copy_from_mem(ptr, serial_version);
497
499
  uint8_t family;
498
- ptr += copy_from_mem(ptr, &family, sizeof(family));
500
+ ptr += copy_from_mem(ptr, family);
499
501
  uint8_t type;
500
- ptr += copy_from_mem(ptr, &type, sizeof(type));
501
- uint8_t unused8;
502
- ptr += copy_from_mem(ptr, &unused8, sizeof(unused8));
502
+ ptr += copy_from_mem(ptr, type);
503
+ ptr += sizeof(uint8_t); // unused
503
504
  uint8_t flags_byte;
504
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
505
+ ptr += copy_from_mem(ptr, flags_byte);
505
506
  uint16_t seed_hash;
506
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
507
- checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
507
+ ptr += copy_from_mem(ptr, seed_hash);
508
+ if (serial_version != SERIAL_VERSION && serial_version != SERIAL_VERSION_LEGACY) {
509
+ throw std::invalid_argument("serial version mismatch: expected " + std::to_string(SERIAL_VERSION) + " or "
510
+ + std::to_string(SERIAL_VERSION_LEGACY) + ", actual " + std::to_string(serial_version));
511
+ }
508
512
  checker<true>::check_sketch_family(family, SKETCH_FAMILY);
509
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
513
+ if (type != SKETCH_TYPE && type != SKETCH_TYPE_LEGACY) {
514
+ throw std::invalid_argument("sketch type mismatch: expected " + std::to_string(SKETCH_TYPE) + " or "
515
+ + std::to_string(SKETCH_TYPE_LEGACY) + ", actual " + std::to_string(type));
516
+ }
510
517
  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
511
518
  if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
512
519
 
@@ -518,12 +525,11 @@ compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::deserialize(const void* b
518
525
  num_entries = 1;
519
526
  } else {
520
527
  ensure_minimum_memory(size, 8); // read the first prelong before this method
521
- ptr += copy_from_mem(ptr, &num_entries, sizeof(num_entries));
522
- uint32_t unused32;
523
- ptr += copy_from_mem(ptr, &unused32, sizeof(unused32));
528
+ ptr += copy_from_mem(ptr, num_entries);
529
+ ptr += sizeof(uint32_t); // unused
524
530
  if (preamble_longs > 2) {
525
531
  ensure_minimum_memory(size, (preamble_longs - 1) << 3);
526
- ptr += copy_from_mem(ptr, &theta, sizeof(theta));
532
+ ptr += copy_from_mem(ptr, theta);
527
533
  }
528
534
  }
529
535
  }
@@ -536,7 +542,7 @@ compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::deserialize(const void* b
536
542
  std::unique_ptr<S, deleter_of_summaries> summary(alloc.allocate(1), deleter_of_summaries(1, false, allocator));
537
543
  for (size_t i = 0; i < num_entries; ++i) {
538
544
  uint64_t key;
539
- ptr += copy_from_mem(ptr, &key, sizeof(key));
545
+ ptr += copy_from_mem(ptr, key);
540
546
  ptr += sd.deserialize(ptr, base + size - ptr, summary.get(), 1);
541
547
  entries.push_back(Entry(key, std::move(*summary)));
542
548
  (*summary).~S();
@@ -548,26 +554,26 @@ compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::deserialize(const void* b
548
554
 
549
555
  template<typename S, typename A>
550
556
  auto compact_tuple_sketch<S, A>::begin() -> iterator {
551
- return iterator(entries_.data(), entries_.size(), 0);
557
+ return iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);
552
558
  }
553
559
 
554
560
  template<typename S, typename A>
555
561
  auto compact_tuple_sketch<S, A>::end() -> iterator {
556
- return iterator(nullptr, 0, entries_.size());
562
+ return iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));
557
563
  }
558
564
 
559
565
  template<typename S, typename A>
560
566
  auto compact_tuple_sketch<S, A>::begin() const -> const_iterator {
561
- return const_iterator(entries_.data(), entries_.size(), 0);
567
+ return const_iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);
562
568
  }
563
569
 
564
570
  template<typename S, typename A>
565
571
  auto compact_tuple_sketch<S, A>::end() const -> const_iterator {
566
- return const_iterator(nullptr, 0, entries_.size());
572
+ return const_iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));
567
573
  }
568
574
 
569
575
  template<typename S, typename A>
570
- void compact_tuple_sketch<S, A>::print_specifics(ostrstream&) const {}
576
+ void compact_tuple_sketch<S, A>::print_specifics(std::ostringstream&) const {}
571
577
 
572
578
  // builder
573
579
 
@@ -581,7 +587,7 @@ tuple_base_builder<builder, P, A>(policy, allocator) {}
581
587
 
582
588
  template<typename S, typename U, typename P, typename A>
583
589
  auto update_tuple_sketch<S, U, P, A>::builder::build() const -> update_tuple_sketch {
584
- return update_tuple_sketch(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->policy_, this->allocator_);
590
+ return update_tuple_sketch(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->policy_, this->allocator_);
585
591
  }
586
592
 
587
593
  } /* namespace datasketches */