datasketches 0.2.0 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/README.md +7 -7
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/theta_wrapper.cpp +20 -4
  8. data/lib/datasketches/version.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
  10. data/vendor/datasketches-cpp/LICENSE +40 -3
  11. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  12. data/vendor/datasketches-cpp/NOTICE +1 -1
  13. data/vendor/datasketches-cpp/README.md +76 -9
  14. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  17. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  18. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  19. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  20. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  21. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  22. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  23. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  24. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
  25. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  26. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  27. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
  28. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
  29. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
  31. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  32. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  33. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  34. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  35. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  36. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  37. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  38. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  39. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  40. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
  41. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  42. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  43. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  44. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  45. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  46. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  47. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  48. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  49. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  50. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  51. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  52. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  53. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  54. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  55. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  56. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  57. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  58. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  59. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  60. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  61. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  62. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
  63. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  64. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  65. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  66. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  67. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  68. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  69. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  70. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  71. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  72. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  73. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  74. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  75. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  76. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  77. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  78. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  79. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  80. data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
  81. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  82. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  83. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  84. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  85. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
  86. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
  87. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  88. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  89. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  90. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
  91. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  92. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  93. data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
  94. data/vendor/datasketches-cpp/python/README.md +50 -50
  95. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  96. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  97. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  98. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
  99. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
  100. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  101. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  102. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  103. data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
  104. data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
  105. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  106. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  107. data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
  108. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  109. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  110. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  111. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  112. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
  113. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  114. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
  117. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
  118. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  119. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  120. data/vendor/datasketches-cpp/setup.py +10 -7
  121. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  122. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  124. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
  125. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
  126. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  127. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  128. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
  129. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  130. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  131. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
  132. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
  133. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
  134. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
  135. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
  137. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
  139. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
  140. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  141. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  142. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  144. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  145. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  146. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  147. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  148. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  149. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
  150. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
  151. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
  152. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  153. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  154. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
  155. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  157. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
  158. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
  159. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  160. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  161. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
  162. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  163. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  164. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  165. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  166. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
  167. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
  168. metadata +18 -7
  169. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  170. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -30,8 +30,8 @@ namespace datasketches {
30
30
  template<typename T, typename S, typename A>
31
31
  var_opt_union<T,S,A>::var_opt_union(uint32_t max_k, const A& allocator) :
32
32
  n_(0),
33
- outer_tau_numer_(0),
34
- outer_tau_denom_(0.0),
33
+ outer_tau_numer_(0.0),
34
+ outer_tau_denom_(0),
35
35
  max_k_(max_k),
36
36
  gadget_(max_k, var_opt_sketch<T,S,A>::DEFAULT_RESIZE_FACTOR, true, allocator)
37
37
  {}
@@ -129,16 +129,11 @@ var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(var_opt_union&& other) {
129
129
 
130
130
  template<typename T, typename S, typename A>
131
131
  var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A& allocator) {
132
- uint8_t preamble_longs;
133
- is.read((char*)&preamble_longs, sizeof(preamble_longs));
134
- uint8_t serial_version;
135
- is.read((char*)&serial_version, sizeof(serial_version));
136
- uint8_t family_id;
137
- is.read((char*)&family_id, sizeof(family_id));
138
- uint8_t flags;
139
- is.read((char*)&flags, sizeof(flags));
140
- uint32_t max_k;
141
- is.read((char*)&max_k, sizeof(max_k));
132
+ const auto preamble_longs = read<uint8_t>(is);
133
+ const auto serial_version = read<uint8_t>(is);
134
+ const auto family_id = read<uint8_t>(is);
135
+ const auto flags = read<uint8_t>(is);
136
+ const auto max_k = read<uint32_t>(is);
142
137
 
143
138
  check_preamble_longs(preamble_longs, flags);
144
139
  check_family_and_serialization_version(family_id, serial_version);
@@ -156,12 +151,9 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A
156
151
  return var_opt_union<T,S,A>(max_k);
157
152
  }
158
153
 
159
- uint64_t items_seen;
160
- is.read((char*)&items_seen, sizeof(items_seen));
161
- double outer_tau_numer;
162
- is.read((char*)&outer_tau_numer, sizeof(outer_tau_numer));
163
- uint64_t outer_tau_denom;
164
- is.read((char*)&outer_tau_denom, sizeof(outer_tau_denom));
154
+ const auto items_seen = read<uint64_t>(is);
155
+ const auto outer_tau_numer = read<double>(is);
156
+ const auto outer_tau_denom = read<uint64_t>(is);
165
157
 
166
158
  var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is, allocator);
167
159
 
@@ -176,15 +168,15 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
176
168
  ensure_minimum_memory(size, 8);
177
169
  const char* ptr = static_cast<const char*>(bytes);
178
170
  uint8_t preamble_longs;
179
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
171
+ ptr += copy_from_mem(ptr, preamble_longs);
180
172
  uint8_t serial_version;
181
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
173
+ ptr += copy_from_mem(ptr, serial_version);
182
174
  uint8_t family_id;
183
- ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
175
+ ptr += copy_from_mem(ptr, family_id);
184
176
  uint8_t flags;
185
- ptr += copy_from_mem(ptr, &flags, sizeof(flags));
177
+ ptr += copy_from_mem(ptr, flags);
186
178
  uint32_t max_k;
187
- ptr += copy_from_mem(ptr, &max_k, sizeof(max_k));
179
+ ptr += copy_from_mem(ptr, max_k);
188
180
 
189
181
  check_preamble_longs(preamble_longs, flags);
190
182
  check_family_and_serialization_version(family_id, serial_version);
@@ -200,11 +192,11 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
200
192
  }
201
193
 
202
194
  uint64_t items_seen;
203
- ptr += copy_from_mem(ptr, &items_seen, sizeof(items_seen));
195
+ ptr += copy_from_mem(ptr, items_seen);
204
196
  double outer_tau_numer;
205
- ptr += copy_from_mem(ptr, &outer_tau_numer, sizeof(outer_tau_numer));
197
+ ptr += copy_from_mem(ptr, outer_tau_numer);
206
198
  uint64_t outer_tau_denom;
207
- ptr += copy_from_mem(ptr, &outer_tau_denom, sizeof(outer_tau_denom));
199
+ ptr += copy_from_mem(ptr, outer_tau_denom);
208
200
 
209
201
  const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
210
202
  var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size, allocator);
@@ -238,16 +230,16 @@ void var_opt_union<T,S,A>::serialize(std::ostream& os) const {
238
230
  flags = 0;
239
231
  }
240
232
 
241
- os.write((char*) &preamble_longs, sizeof(uint8_t));
242
- os.write((char*) &serialization_version, sizeof(uint8_t));
243
- os.write((char*) &family_id, sizeof(uint8_t));
244
- os.write((char*) &flags, sizeof(uint8_t));
245
- os.write((char*) &max_k_, sizeof(uint32_t));
233
+ write(os, preamble_longs);
234
+ write(os, serialization_version);
235
+ write(os, family_id);
236
+ write(os, flags);
237
+ write(os, max_k_);
246
238
 
247
239
  if (!empty) {
248
- os.write((char*) &n_, sizeof(uint64_t));
249
- os.write((char*) &outer_tau_numer_, sizeof(double));
250
- os.write((char*) &outer_tau_denom_, sizeof(uint64_t));
240
+ write(os, n_);
241
+ write(os, outer_tau_numer_);
242
+ write(os, outer_tau_denom_);
251
243
  gadget_.serialize(os);
252
244
  }
253
245
  }
@@ -275,16 +267,16 @@ std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header
275
267
  }
276
268
 
277
269
  // first prelong
278
- ptr += copy_to_mem(&preamble_longs, ptr, sizeof(uint8_t));
279
- ptr += copy_to_mem(&serialization_version, ptr, sizeof(uint8_t));
280
- ptr += copy_to_mem(&family_id, ptr, sizeof(uint8_t));
281
- ptr += copy_to_mem(&flags, ptr, sizeof(uint8_t));
282
- ptr += copy_to_mem(&max_k_, ptr, sizeof(uint32_t));
270
+ ptr += copy_to_mem(preamble_longs, ptr);
271
+ ptr += copy_to_mem(serialization_version, ptr);
272
+ ptr += copy_to_mem(family_id, ptr);
273
+ ptr += copy_to_mem(flags, ptr);
274
+ ptr += copy_to_mem(max_k_, ptr);
283
275
 
284
276
  if (!empty) {
285
- ptr += copy_to_mem(&n_, ptr, sizeof(uint64_t));
286
- ptr += copy_to_mem(&outer_tau_numer_, ptr, sizeof(double));
287
- ptr += copy_to_mem(&outer_tau_denom_, ptr, sizeof(uint64_t));
277
+ ptr += copy_to_mem(n_, ptr);
278
+ ptr += copy_to_mem(outer_tau_numer_, ptr);
279
+ ptr += copy_to_mem(outer_tau_denom_, ptr);
288
280
 
289
281
  auto gadget_bytes = gadget_.serialize();
290
282
  ptr += copy_to_mem(gadget_bytes.data(), ptr, gadget_bytes.size() * sizeof(uint8_t));
@@ -303,14 +295,16 @@ void var_opt_union<T,S,A>::reset() {
303
295
 
304
296
  template<typename T, typename S, typename A>
305
297
  string<A> var_opt_union<T,S,A>::to_string() const {
306
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
307
- os << "### VarOpt Union SUMMARY: " << std::endl;
308
- os << " . n : " << n_ << std::endl;
298
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
299
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
300
+ std::ostringstream os;
301
+ os << "### VarOpt Union SUMMARY:" << std::endl;
302
+ os << " n : " << n_ << std::endl;
309
303
  os << " Max k : " << max_k_ << std::endl;
310
- os << " Gadget Summary: " << std::endl;
304
+ os << " Gadget Summary:" << std::endl;
311
305
  os << gadget_.to_string();
312
- os << "### END VarOpt Union SUMMARY: " << std::endl;
313
- return os.str();
306
+ os << "### END VarOpt Union SUMMARY" << std::endl;
307
+ return string<A>(os.str().c_str(), gadget_.allocator_);
314
308
  }
315
309
 
316
310
  template<typename T, typename S, typename A>
@@ -41,7 +41,7 @@ static constexpr double EPS = 1e-13;
41
41
  static var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
42
42
  var_opt_sketch<int> sk(k);
43
43
  for (uint64_t i = 0; i < n; ++i) {
44
- sk.update(i, 1.0);
44
+ sk.update(static_cast<int>(i), 1.0);
45
45
  }
46
46
  return sk;
47
47
  }
@@ -71,7 +71,7 @@ static void check_if_equal(var_opt_sketch<T,S,A>& sk1, var_opt_sketch<T,S,A>& sk
71
71
 
72
72
  TEST_CASE("varopt sketch: invalid k", "[var_opt_sketch]") {
73
73
  REQUIRE_THROWS_AS(var_opt_sketch<int>(0), std::invalid_argument);
74
- REQUIRE_THROWS_AS(var_opt_sketch<int>(1 << 31), std::invalid_argument); // aka k < 0
74
+ REQUIRE_THROWS_AS(var_opt_sketch<int>(1U << 31), std::invalid_argument); // aka k < 0
75
75
  }
76
76
 
77
77
  TEST_CASE("varopt sketch: bad serialization version", "[var_opt_sketch]") {
@@ -216,11 +216,11 @@ TEST_CASE("varopt sketch: cumulative weight", "[var_opt_sketch]") {
216
216
  // which covers about 10 orders of magnitude
217
217
  double w = std::exp(5 * N(rand));
218
218
  input_sum += w;
219
- sk.update(i, w);
219
+ sk.update(static_cast<int>(i), w);
220
220
  }
221
221
 
222
222
  double output_sum = 0.0;
223
- for (auto& it : sk) { // std::pair<int, weight>
223
+ for (auto it : sk) { // std::pair<int, weight>
224
224
  output_sum += it.second;
225
225
  }
226
226
 
@@ -350,7 +350,7 @@ TEST_CASE("varopt sketch: pseudo-heavy update", "[var_opt_sketch]") {
350
350
  // Last one should call update_pseudo_heavy_r_eq_1(), since we'll have
351
351
  // added k-1 heavy items, leaving only 1 item left in R
352
352
  for (uint32_t i = 1; i <= k; ++i) {
353
- sk.update(-i, k + (i * wt_scale));
353
+ sk.update(-1 * static_cast<int>(i), k + (i * wt_scale));
354
354
  }
355
355
 
356
356
  auto it = sk.begin();
@@ -442,7 +442,7 @@ TEST_CASE("varopt sketch: estimate subset sum", "[var_opt_sketch]") {
442
442
  // finally, a non-degenerate predicate
443
443
  // insert negative items with identical weights, filter for negative weights only
444
444
  for (uint32_t i = 1; i <= (k + 1); ++i) {
445
- sk.update(static_cast<int32_t>(-i), 1.0 * i);
445
+ sk.update(-1 * static_cast<int32_t>(i), static_cast<double>(i));
446
446
  total_weight += 1.0 * i;
447
447
  }
448
448
 
@@ -41,7 +41,7 @@ static constexpr double EPS = 1e-13;
41
41
  static var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
42
42
  var_opt_sketch<int> sk(k);
43
43
  for (uint64_t i = 0; i < n; ++i) {
44
- sk.update(i, 1.0);
44
+ sk.update(static_cast<int>(i), 1.0);
45
45
  }
46
46
  return sk;
47
47
  }
@@ -147,7 +147,7 @@ TEST_CASE("varopt union: bad serialization version", "[var_opt_union]") {
147
147
 
148
148
  TEST_CASE("varopt union: invalid k", "[var_opt_union]") {
149
149
  REQUIRE_THROWS_AS(var_opt_union<int>(0), std::invalid_argument);
150
- REQUIRE_THROWS_AS(var_opt_union<int>(1<<31), std::invalid_argument);
150
+ REQUIRE_THROWS_AS(var_opt_union<int>(1U << 31), std::invalid_argument);
151
151
  }
152
152
 
153
153
  TEST_CASE("varopt union: bad family", "[var_opt_union]") {
@@ -179,13 +179,13 @@ TEST_CASE("varopt union: empty union", "[var_opt_union]") {
179
179
  }
180
180
 
181
181
  TEST_CASE("varopt union: two exact sketches", "[var_opt_union]") {
182
- uint64_t n = 4; // 2n < k
182
+ int n = 4; // 2n < k
183
183
  uint32_t k = 10;
184
184
  var_opt_sketch<int> sk1(k), sk2(k);
185
185
 
186
- for (uint64_t i = 1; i <= n; ++i) {
187
- sk1.update(i, i);
188
- sk2.update(static_cast<int64_t>(-i), i);
186
+ for (int i = 1; i <= n; ++i) {
187
+ sk1.update(i, static_cast<double>(i));
188
+ sk2.update(-i, static_cast<double>(i));
189
189
  }
190
190
 
191
191
  var_opt_union<int> u(k);
@@ -193,7 +193,7 @@ TEST_CASE("varopt union: two exact sketches", "[var_opt_union]") {
193
193
  u.update(sk2);
194
194
 
195
195
  var_opt_sketch<int> result = u.get_result();
196
- REQUIRE(result.get_n() == 2 * n);
196
+ REQUIRE(result.get_n() == 2ULL * n);
197
197
  REQUIRE(result.get_k() == k);
198
198
  }
199
199
 
@@ -204,13 +204,13 @@ TEST_CASE("varopt union: heavy sampling sketch", "[var_opt_union]") {
204
204
  uint32_t k2 = 5;
205
205
  var_opt_sketch<int64_t> sk1(k1), sk2(k2);
206
206
  for (uint64_t i = 1; i <= n1; ++i) {
207
- sk1.update(i, i);
207
+ sk1.update(i, static_cast<double>(i));
208
208
  }
209
209
 
210
210
  for (uint64_t i = 1; i < n2; ++i) { // we'll add a very heavy one later
211
- sk2.update(static_cast<int64_t>(-i), i + 1000.0);
211
+ sk2.update(-1 * static_cast<int64_t>(i), i + 1000.0);
212
212
  }
213
- sk2.update(-n2, 1000000.0);
213
+ sk2.update(-1 * static_cast<int64_t>(n2), 1000000.0);
214
214
 
215
215
  var_opt_union<int64_t> u(k1);
216
216
  u.update(sk1);
@@ -258,15 +258,15 @@ TEST_CASE("varopt union: small sampling sketch", "[var_opt_union]") {
258
258
  uint64_t n2 = 64;
259
259
 
260
260
  var_opt_sketch<float> sk(k_small);
261
- for (uint64_t i = 0; i < n1; ++i) { sk.update(i); }
262
- sk.update(-1, n1 * n1); // add a heavy item
261
+ for (uint64_t i = 0; i < n1; ++i) { sk.update(static_cast<float>(i)); }
262
+ sk.update(-1.0f, static_cast<double>(n1 * n1)); // add a heavy item
263
263
 
264
264
  var_opt_union<float> u(k_max);
265
265
  u.update(sk);
266
266
 
267
267
  // another one, but different n to get a different per-item weight
268
268
  var_opt_sketch<float> sk2(k_small);
269
- for (uint64_t i = 0; i < n2; ++i) { sk2.update(i); }
269
+ for (uint64_t i = 0; i < n2; ++i) { sk2.update(static_cast<float>(i)); }
270
270
  u.update(sk2);
271
271
 
272
272
  // should trigger migrate_marked_items_by_decreasing_k()
@@ -49,8 +49,9 @@ class CMakeBuild(build_ext):
49
49
  os.path.dirname(self.get_ext_fullpath(ext.name)))
50
50
  cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir]
51
51
  cmake_args += ['-DWITH_PYTHON=True']
52
+ cmake_args += ['-DCMAKE_CXX_STANDARD=11']
52
53
  # ensure we use a consistent python version
53
- cmake_args += ['-DPYTHON_EXECUTABLE=' + sys.executable]
54
+ cmake_args += ['-DPython3_EXECUTABLE=' + sys.executable]
54
55
  cfg = 'Debug' if self.debug else 'Release'
55
56
  build_args = ['--config', cfg]
56
57
 
@@ -59,7 +60,8 @@ class CMakeBuild(build_ext):
59
60
  cfg.upper(),
60
61
  extdir)]
61
62
  if sys.maxsize > 2**32:
62
- cmake_args += ['-A', 'x64']
63
+ cmake_args += ['-T', 'host=x64']
64
+ cmake_args += ['-DCMAKE_GENERATOR_PLATFORM=x64']
63
65
  build_args += ['--', '/m']
64
66
  else:
65
67
  cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
@@ -74,23 +76,24 @@ class CMakeBuild(build_ext):
74
76
  subprocess.check_call(['cmake', ext.sourcedir] + cmake_args,
75
77
  cwd=self.build_temp, env=env)
76
78
  subprocess.check_call(['cmake', '--build', '.', '--target', 'python'] + build_args,
77
- cwd=self.build_temp)
79
+ cwd=self.build_temp, env=env)
78
80
  print() # add an empty line to pretty print
79
81
 
80
82
  setup(
81
83
  name='datasketches',
82
- version='3.0.0',
83
- author='Apache DataSketches Developers',
84
+ version='3.3.0',
85
+ author='Apache Software Foundation',
84
86
  author_email='dev@datasketches.apache.org',
85
- description='A wrapper for the C++ Apache DataSketches library',
87
+ description='The Apache DataSketches Library for Python',
86
88
  license='Apache License 2.0',
87
89
  url='http://datasketches.apache.org',
88
90
  long_description=open('python/README.md').read(),
91
+ long_description_content_type='text/markdown',
89
92
  packages=find_packages('python'), # python pacakges only in this dir
90
93
  package_dir={'':'python'},
91
94
  # may need to add all source paths for sdist packages w/o MANIFEST.in
92
95
  ext_modules=[CMakeExtension('datasketches')],
93
96
  cmdclass={'build_ext': CMakeBuild},
94
- setup_requires=['setuptools_scm','tox-setuptools'],
97
+ install_requires=['numpy'],
95
98
  zip_safe=False
96
99
  )
@@ -32,53 +32,34 @@ target_include_directories(theta
32
32
  target_link_libraries(theta INTERFACE common)
33
33
  target_compile_features(theta INTERFACE cxx_std_11)
34
34
 
35
- set(theta_HEADERS "")
36
- list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/theta_sketch_impl.hpp")
37
- list(APPEND theta_HEADERS "include/theta_union.hpp;include/theta_union_impl.hpp")
38
- list(APPEND theta_HEADERS "include/theta_intersection.hpp;include/theta_intersection_impl.hpp")
39
- list(APPEND theta_HEADERS "include/theta_a_not_b.hpp;include/theta_a_not_b_impl.hpp")
40
- list(APPEND theta_HEADERS "include/theta_jaccard_similarity.hpp")
41
- list(APPEND theta_HEADERS "include/theta_comparators.hpp")
42
- list(APPEND theta_HEADERS "include/theta_constants.hpp")
43
- list(APPEND theta_HEADERS "include/theta_helpers.hpp")
44
- list(APPEND theta_HEADERS "include/theta_update_sketch_base.hpp;include/theta_update_sketch_base_impl.hpp")
45
- list(APPEND theta_HEADERS "include/theta_union_base.hpp;include/theta_union_base_impl.hpp")
46
- list(APPEND theta_HEADERS "include/theta_intersection_base.hpp;include/theta_intersection_base_impl.hpp")
47
- list(APPEND theta_HEADERS "include/theta_set_difference_base.hpp;include/theta_set_difference_base_impl.hpp")
48
- list(APPEND theta_HEADERS "include/theta_jaccard_similarity_base.hpp")
49
- list(APPEND theta_HEADERS "include/bounds_on_ratios_in_sampled_sets.hpp")
50
- list(APPEND theta_HEADERS "include/bounds_on_ratios_in_theta_sketched_sets.hpp")
51
-
52
35
  install(TARGETS theta
53
36
  EXPORT ${PROJECT_NAME}
54
37
  )
55
38
 
56
- install(FILES ${theta_HEADERS}
39
+ install(FILES
40
+ include/theta_sketch.hpp
41
+ include/theta_sketch_impl.hpp
42
+ include/theta_union.hpp
43
+ include/theta_union_impl.hpp
44
+ include/theta_intersection.hpp
45
+ include/theta_intersection_impl.hpp
46
+ include/theta_a_not_b.hpp
47
+ include/theta_a_not_b_impl.hpp
48
+ include/theta_jaccard_similarity.hpp
49
+ include/theta_comparators.hpp
50
+ include/theta_constants.hpp
51
+ include/theta_helpers.hpp
52
+ include/theta_update_sketch_base.hpp
53
+ include/theta_update_sketch_base_impl.hpp
54
+ include/theta_union_base.hpp
55
+ include/theta_union_base_impl.hpp
56
+ include/theta_intersection_base.hpp
57
+ include/theta_intersection_base_impl.hpp
58
+ include/theta_set_difference_base.hpp
59
+ include/theta_set_difference_base_impl.hpp
60
+ include/theta_jaccard_similarity_base.hpp
61
+ include/bounds_on_ratios_in_sampled_sets.hpp
62
+ include/bounds_on_ratios_in_theta_sketched_sets.hpp
63
+ include/compact_theta_sketch_parser.hpp
64
+ include/compact_theta_sketch_parser_impl.hpp
57
65
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
58
-
59
- target_sources(theta
60
- INTERFACE
61
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch.hpp
62
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union.hpp
63
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection.hpp
64
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b.hpp
65
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch_impl.hpp
66
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_impl.hpp
67
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_impl.hpp
68
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b_impl.hpp
69
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity.hpp
70
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_comparators.hpp
71
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_constants.hpp
72
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_helpers.hpp
73
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base.hpp
74
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base_impl.hpp
75
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base.hpp
76
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base_impl.hpp
77
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base.hpp
78
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base_impl.hpp
79
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base.hpp
80
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base_impl.hpp
81
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity_base.hpp
82
- ${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_sampled_sets.hpp
83
- ${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_theta_sketched_sets.hpp
84
- )
@@ -90,7 +90,7 @@ public:
90
90
  * @param f the inclusion probability used to produce the set with size <i>a</i>.
91
91
  * @return the approximate lower bound
92
92
  */
93
- static double estimate_of_a(uint64_t a, uint64_t f) {
93
+ static double estimate_of_a(uint64_t a, double f) {
94
94
  check_inputs(a, 1, f);
95
95
  return a / f;
96
96
  }
@@ -0,0 +1,67 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef COMPACT_THETA_SKETCH_PARSER_HPP_
21
+ #define COMPACT_THETA_SKETCH_PARSER_HPP_
22
+
23
+ #include <stdint.h>
24
+
25
+ namespace datasketches {
26
+
27
+ template<bool dummy>
28
+ class compact_theta_sketch_parser {
29
+ public:
30
+ struct compact_theta_sketch_data {
31
+ bool is_empty;
32
+ bool is_ordered;
33
+ uint16_t seed_hash;
34
+ uint32_t num_entries;
35
+ uint64_t theta;
36
+ const uint64_t* entries;
37
+ };
38
+
39
+ static compact_theta_sketch_data parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error = false);
40
+
41
+ private:
42
+ // offsets are in sizeof(type)
43
+ static const size_t COMPACT_SKETCH_PRE_LONGS_BYTE = 0;
44
+ static const size_t COMPACT_SKETCH_SERIAL_VERSION_BYTE = 1;
45
+ static const size_t COMPACT_SKETCH_TYPE_BYTE = 2;
46
+ static const size_t COMPACT_SKETCH_FLAGS_BYTE = 5;
47
+ static const size_t COMPACT_SKETCH_SEED_HASH_U16 = 3;
48
+ static const size_t COMPACT_SKETCH_NUM_ENTRIES_U32 = 2;
49
+ static const size_t COMPACT_SKETCH_SINGLE_ENTRY_U64 = 1;
50
+ static const size_t COMPACT_SKETCH_ENTRIES_EXACT_U64 = 2;
51
+ static const size_t COMPACT_SKETCH_THETA_U64 = 2;
52
+ static const size_t COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 = 3;
53
+
54
+ static const uint8_t COMPACT_SKETCH_IS_EMPTY_FLAG = 2;
55
+ static const uint8_t COMPACT_SKETCH_IS_ORDERED_FLAG = 4;
56
+
57
+ static const uint8_t COMPACT_SKETCH_SERIAL_VERSION = 3;
58
+ static const uint8_t COMPACT_SKETCH_TYPE = 3;
59
+
60
+ static std::string hex_dump(const uint8_t* ptr, size_t size);
61
+ };
62
+
63
+ } /* namespace datasketches */
64
+
65
+ #include "compact_theta_sketch_parser_impl.hpp"
66
+
67
+ #endif
@@ -0,0 +1,137 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef COMPACT_THETA_SKETCH_PARSER_IMPL_HPP_
21
+ #define COMPACT_THETA_SKETCH_PARSER_IMPL_HPP_
22
+
23
+ #include <iostream>
24
+ #include <iomanip>
25
+
26
+ namespace datasketches {
27
+
28
+ template<bool dummy>
29
+ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data {
30
+ if (size < 8) throw std::invalid_argument("at least 8 bytes expected, actual " + std::to_string(size)
31
+ + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
32
+
33
+ uint8_t serial_version = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE];
34
+
35
+ switch(serial_version) {
36
+ case COMPACT_SKETCH_SERIAL_VERSION: {
37
+ checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
38
+ uint64_t theta = theta_constants::MAX_THETA;
39
+ const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
40
+ if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_EMPTY_FLAG)) {
41
+ return {true, true, seed_hash, 0, theta, nullptr};
42
+ }
43
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
44
+ const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
45
+ if (has_theta) {
46
+ if (size < 16) throw std::invalid_argument("at least 16 bytes expected, actual " + std::to_string(size));
47
+ theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
48
+ }
49
+ if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] == 1) {
50
+ return {false, true, seed_hash, 1, theta, reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
51
+ }
52
+ const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
53
+ const size_t entries_start_u64 = has_theta ? COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 : COMPACT_SKETCH_ENTRIES_EXACT_U64;
54
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + entries_start_u64;
55
+ const size_t expected_size_bytes = (entries_start_u64 + num_entries) * sizeof(uint64_t);
56
+ if (size < expected_size_bytes) {
57
+ throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
58
+ + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
59
+ }
60
+ const bool is_ordered = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_ORDERED_FLAG);
61
+ return {false, is_ordered, seed_hash, num_entries, theta, entries};
62
+ }
63
+ case 1: {
64
+ uint16_t seed_hash = compute_seed_hash(seed);
65
+ checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
66
+ const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
67
+ uint64_t theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
68
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
69
+ if (is_empty) {
70
+ return {true, true, seed_hash, 0, theta, nullptr};
71
+ }
72
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
73
+ const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
74
+ if (size < expected_size_bytes) {
75
+ throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
76
+ + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
77
+ }
78
+ return {false, true, seed_hash, num_entries, theta, entries};
79
+ }
80
+ case 2: {
81
+ uint8_t preamble_size = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE];
82
+ checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
83
+ const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
84
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
85
+ if (preamble_size == 1) {
86
+ return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr};
87
+ } else if (preamble_size == 2) {
88
+ const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
89
+ if (num_entries == 0) {
90
+ return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr};
91
+ } else {
92
+ const size_t expected_size_bytes = (preamble_size + num_entries) << 3;
93
+ if (size < expected_size_bytes) {
94
+ throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
95
+ + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
96
+ }
97
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_EXACT_U64;
98
+ return {false, true, seed_hash, num_entries, theta_constants::MAX_THETA, entries};
99
+ }
100
+ } else if (preamble_size == 3) {
101
+ const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
102
+ uint64_t theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
103
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
104
+ if (is_empty) {
105
+ return {true, true, seed_hash, 0, theta, nullptr};
106
+ }
107
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
108
+ const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
109
+ if (size < expected_size_bytes) {
110
+ throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
111
+ + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
112
+ }
113
+ return {false, true, seed_hash, num_entries, theta, entries};
114
+ } else {
115
+ throw std::invalid_argument(std::to_string(preamble_size) + " longs of premable, but expected 1, 2, or 3");
116
+ }
117
+ }
118
+ default:
119
+ // this should always fail since the valid cases are handled above
120
+ checker<true>::check_serial_version(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE], COMPACT_SKETCH_SERIAL_VERSION);
121
+ // this throw is never reached, because check_serial_version will throw an informative exception.
122
+ // This is only here to avoid a compiler warning about a path without a return value.
123
+ throw std::invalid_argument("unexpected sketch serialization version");
124
+ }
125
+ }
126
+
127
+ template<bool dummy>
128
+ std::string compact_theta_sketch_parser<dummy>::hex_dump(const uint8_t* ptr, size_t size) {
129
+ std::stringstream s;
130
+ s << std::hex << std::setfill('0') << std::uppercase;
131
+ for (size_t i = 0; i < size; ++i) s << std::setw(2) << (ptr[i] & 0xff);
132
+ return s.str();
133
+ }
134
+
135
+ } /* namespace datasketches */
136
+
137
+ #endif
@@ -21,14 +21,19 @@
21
21
  #define THETA_CONSTANTS_HPP_
22
22
 
23
23
  #include <climits>
24
+ #include "common_defs.hpp"
24
25
 
25
26
  namespace datasketches {
26
27
 
27
28
  namespace theta_constants {
28
- enum resize_factor { X1, X2, X4, X8 };
29
- static const uint64_t MAX_THETA = LLONG_MAX; // signed max for compatibility with Java
30
- static const uint8_t MIN_LG_K = 5;
31
- static const uint8_t MAX_LG_K = 26;
29
+ using resize_factor = datasketches::resize_factor;
30
+ //enum resize_factor { X1, X2, X4, X8 };
31
+ const uint64_t MAX_THETA = LLONG_MAX; // signed max for compatibility with Java
32
+ const uint8_t MIN_LG_K = 5;
33
+ const uint8_t MAX_LG_K = 26;
34
+
35
+ const uint8_t DEFAULT_LG_K = 12;
36
+ const resize_factor DEFAULT_RESIZE_FACTOR = resize_factor::X8;
32
37
  }
33
38
 
34
39
  } /* namespace datasketches */