datasketches 0.2.0 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE +40 -3
- data/NOTICE +1 -1
- data/README.md +7 -7
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
- data/vendor/datasketches-cpp/LICENSE +40 -3
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/pyproject.toml +4 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
- data/vendor/datasketches-cpp/python/README.md +50 -50
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +10 -7
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
- metadata +18 -7
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
|
@@ -30,8 +30,8 @@ namespace datasketches {
|
|
|
30
30
|
template<typename T, typename S, typename A>
|
|
31
31
|
var_opt_union<T,S,A>::var_opt_union(uint32_t max_k, const A& allocator) :
|
|
32
32
|
n_(0),
|
|
33
|
-
outer_tau_numer_(0),
|
|
34
|
-
outer_tau_denom_(0
|
|
33
|
+
outer_tau_numer_(0.0),
|
|
34
|
+
outer_tau_denom_(0),
|
|
35
35
|
max_k_(max_k),
|
|
36
36
|
gadget_(max_k, var_opt_sketch<T,S,A>::DEFAULT_RESIZE_FACTOR, true, allocator)
|
|
37
37
|
{}
|
|
@@ -129,16 +129,11 @@ var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(var_opt_union&& other) {
|
|
|
129
129
|
|
|
130
130
|
template<typename T, typename S, typename A>
|
|
131
131
|
var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A& allocator) {
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
uint8_t
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
is.read((char*)&family_id, sizeof(family_id));
|
|
138
|
-
uint8_t flags;
|
|
139
|
-
is.read((char*)&flags, sizeof(flags));
|
|
140
|
-
uint32_t max_k;
|
|
141
|
-
is.read((char*)&max_k, sizeof(max_k));
|
|
132
|
+
const auto preamble_longs = read<uint8_t>(is);
|
|
133
|
+
const auto serial_version = read<uint8_t>(is);
|
|
134
|
+
const auto family_id = read<uint8_t>(is);
|
|
135
|
+
const auto flags = read<uint8_t>(is);
|
|
136
|
+
const auto max_k = read<uint32_t>(is);
|
|
142
137
|
|
|
143
138
|
check_preamble_longs(preamble_longs, flags);
|
|
144
139
|
check_family_and_serialization_version(family_id, serial_version);
|
|
@@ -156,12 +151,9 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A
|
|
|
156
151
|
return var_opt_union<T,S,A>(max_k);
|
|
157
152
|
}
|
|
158
153
|
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
is.read((char*)&outer_tau_numer, sizeof(outer_tau_numer));
|
|
163
|
-
uint64_t outer_tau_denom;
|
|
164
|
-
is.read((char*)&outer_tau_denom, sizeof(outer_tau_denom));
|
|
154
|
+
const auto items_seen = read<uint64_t>(is);
|
|
155
|
+
const auto outer_tau_numer = read<double>(is);
|
|
156
|
+
const auto outer_tau_denom = read<uint64_t>(is);
|
|
165
157
|
|
|
166
158
|
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is, allocator);
|
|
167
159
|
|
|
@@ -176,15 +168,15 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
|
|
|
176
168
|
ensure_minimum_memory(size, 8);
|
|
177
169
|
const char* ptr = static_cast<const char*>(bytes);
|
|
178
170
|
uint8_t preamble_longs;
|
|
179
|
-
ptr += copy_from_mem(ptr,
|
|
171
|
+
ptr += copy_from_mem(ptr, preamble_longs);
|
|
180
172
|
uint8_t serial_version;
|
|
181
|
-
ptr += copy_from_mem(ptr,
|
|
173
|
+
ptr += copy_from_mem(ptr, serial_version);
|
|
182
174
|
uint8_t family_id;
|
|
183
|
-
ptr += copy_from_mem(ptr,
|
|
175
|
+
ptr += copy_from_mem(ptr, family_id);
|
|
184
176
|
uint8_t flags;
|
|
185
|
-
ptr += copy_from_mem(ptr,
|
|
177
|
+
ptr += copy_from_mem(ptr, flags);
|
|
186
178
|
uint32_t max_k;
|
|
187
|
-
ptr += copy_from_mem(ptr,
|
|
179
|
+
ptr += copy_from_mem(ptr, max_k);
|
|
188
180
|
|
|
189
181
|
check_preamble_longs(preamble_longs, flags);
|
|
190
182
|
check_family_and_serialization_version(family_id, serial_version);
|
|
@@ -200,11 +192,11 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
|
|
|
200
192
|
}
|
|
201
193
|
|
|
202
194
|
uint64_t items_seen;
|
|
203
|
-
ptr += copy_from_mem(ptr,
|
|
195
|
+
ptr += copy_from_mem(ptr, items_seen);
|
|
204
196
|
double outer_tau_numer;
|
|
205
|
-
ptr += copy_from_mem(ptr,
|
|
197
|
+
ptr += copy_from_mem(ptr, outer_tau_numer);
|
|
206
198
|
uint64_t outer_tau_denom;
|
|
207
|
-
ptr += copy_from_mem(ptr,
|
|
199
|
+
ptr += copy_from_mem(ptr, outer_tau_denom);
|
|
208
200
|
|
|
209
201
|
const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
|
|
210
202
|
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size, allocator);
|
|
@@ -238,16 +230,16 @@ void var_opt_union<T,S,A>::serialize(std::ostream& os) const {
|
|
|
238
230
|
flags = 0;
|
|
239
231
|
}
|
|
240
232
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
233
|
+
write(os, preamble_longs);
|
|
234
|
+
write(os, serialization_version);
|
|
235
|
+
write(os, family_id);
|
|
236
|
+
write(os, flags);
|
|
237
|
+
write(os, max_k_);
|
|
246
238
|
|
|
247
239
|
if (!empty) {
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
240
|
+
write(os, n_);
|
|
241
|
+
write(os, outer_tau_numer_);
|
|
242
|
+
write(os, outer_tau_denom_);
|
|
251
243
|
gadget_.serialize(os);
|
|
252
244
|
}
|
|
253
245
|
}
|
|
@@ -275,16 +267,16 @@ std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header
|
|
|
275
267
|
}
|
|
276
268
|
|
|
277
269
|
// first prelong
|
|
278
|
-
ptr += copy_to_mem(
|
|
279
|
-
ptr += copy_to_mem(
|
|
280
|
-
ptr += copy_to_mem(
|
|
281
|
-
ptr += copy_to_mem(
|
|
282
|
-
ptr += copy_to_mem(
|
|
270
|
+
ptr += copy_to_mem(preamble_longs, ptr);
|
|
271
|
+
ptr += copy_to_mem(serialization_version, ptr);
|
|
272
|
+
ptr += copy_to_mem(family_id, ptr);
|
|
273
|
+
ptr += copy_to_mem(flags, ptr);
|
|
274
|
+
ptr += copy_to_mem(max_k_, ptr);
|
|
283
275
|
|
|
284
276
|
if (!empty) {
|
|
285
|
-
ptr += copy_to_mem(
|
|
286
|
-
ptr += copy_to_mem(
|
|
287
|
-
ptr += copy_to_mem(
|
|
277
|
+
ptr += copy_to_mem(n_, ptr);
|
|
278
|
+
ptr += copy_to_mem(outer_tau_numer_, ptr);
|
|
279
|
+
ptr += copy_to_mem(outer_tau_denom_, ptr);
|
|
288
280
|
|
|
289
281
|
auto gadget_bytes = gadget_.serialize();
|
|
290
282
|
ptr += copy_to_mem(gadget_bytes.data(), ptr, gadget_bytes.size() * sizeof(uint8_t));
|
|
@@ -303,14 +295,16 @@ void var_opt_union<T,S,A>::reset() {
|
|
|
303
295
|
|
|
304
296
|
template<typename T, typename S, typename A>
|
|
305
297
|
string<A> var_opt_union<T,S,A>::to_string() const {
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
298
|
+
// Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
|
|
299
|
+
// The stream does not support passing an allocator instance, and alternatives are complicated.
|
|
300
|
+
std::ostringstream os;
|
|
301
|
+
os << "### VarOpt Union SUMMARY:" << std::endl;
|
|
302
|
+
os << " n : " << n_ << std::endl;
|
|
309
303
|
os << " Max k : " << max_k_ << std::endl;
|
|
310
|
-
os << " Gadget Summary:
|
|
304
|
+
os << " Gadget Summary:" << std::endl;
|
|
311
305
|
os << gadget_.to_string();
|
|
312
|
-
os << "### END VarOpt Union SUMMARY
|
|
313
|
-
return os.str();
|
|
306
|
+
os << "### END VarOpt Union SUMMARY" << std::endl;
|
|
307
|
+
return string<A>(os.str().c_str(), gadget_.allocator_);
|
|
314
308
|
}
|
|
315
309
|
|
|
316
310
|
template<typename T, typename S, typename A>
|
|
@@ -41,7 +41,7 @@ static constexpr double EPS = 1e-13;
|
|
|
41
41
|
static var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
|
|
42
42
|
var_opt_sketch<int> sk(k);
|
|
43
43
|
for (uint64_t i = 0; i < n; ++i) {
|
|
44
|
-
sk.update(i, 1.0);
|
|
44
|
+
sk.update(static_cast<int>(i), 1.0);
|
|
45
45
|
}
|
|
46
46
|
return sk;
|
|
47
47
|
}
|
|
@@ -71,7 +71,7 @@ static void check_if_equal(var_opt_sketch<T,S,A>& sk1, var_opt_sketch<T,S,A>& sk
|
|
|
71
71
|
|
|
72
72
|
TEST_CASE("varopt sketch: invalid k", "[var_opt_sketch]") {
|
|
73
73
|
REQUIRE_THROWS_AS(var_opt_sketch<int>(0), std::invalid_argument);
|
|
74
|
-
REQUIRE_THROWS_AS(var_opt_sketch<int>(
|
|
74
|
+
REQUIRE_THROWS_AS(var_opt_sketch<int>(1U << 31), std::invalid_argument); // aka k < 0
|
|
75
75
|
}
|
|
76
76
|
|
|
77
77
|
TEST_CASE("varopt sketch: bad serialization version", "[var_opt_sketch]") {
|
|
@@ -216,11 +216,11 @@ TEST_CASE("varopt sketch: cumulative weight", "[var_opt_sketch]") {
|
|
|
216
216
|
// which covers about 10 orders of magnitude
|
|
217
217
|
double w = std::exp(5 * N(rand));
|
|
218
218
|
input_sum += w;
|
|
219
|
-
sk.update(i, w);
|
|
219
|
+
sk.update(static_cast<int>(i), w);
|
|
220
220
|
}
|
|
221
221
|
|
|
222
222
|
double output_sum = 0.0;
|
|
223
|
-
for (auto
|
|
223
|
+
for (auto it : sk) { // std::pair<int, weight>
|
|
224
224
|
output_sum += it.second;
|
|
225
225
|
}
|
|
226
226
|
|
|
@@ -350,7 +350,7 @@ TEST_CASE("varopt sketch: pseudo-heavy update", "[var_opt_sketch]") {
|
|
|
350
350
|
// Last one should call update_pseudo_heavy_r_eq_1(), since we'll have
|
|
351
351
|
// added k-1 heavy items, leaving only 1 item left in R
|
|
352
352
|
for (uint32_t i = 1; i <= k; ++i) {
|
|
353
|
-
sk.update(-i, k + (i * wt_scale));
|
|
353
|
+
sk.update(-1 * static_cast<int>(i), k + (i * wt_scale));
|
|
354
354
|
}
|
|
355
355
|
|
|
356
356
|
auto it = sk.begin();
|
|
@@ -442,7 +442,7 @@ TEST_CASE("varopt sketch: estimate subset sum", "[var_opt_sketch]") {
|
|
|
442
442
|
// finally, a non-degenerate predicate
|
|
443
443
|
// insert negative items with identical weights, filter for negative weights only
|
|
444
444
|
for (uint32_t i = 1; i <= (k + 1); ++i) {
|
|
445
|
-
sk.update(static_cast<int32_t>(
|
|
445
|
+
sk.update(-1 * static_cast<int32_t>(i), static_cast<double>(i));
|
|
446
446
|
total_weight += 1.0 * i;
|
|
447
447
|
}
|
|
448
448
|
|
|
@@ -41,7 +41,7 @@ static constexpr double EPS = 1e-13;
|
|
|
41
41
|
static var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
|
|
42
42
|
var_opt_sketch<int> sk(k);
|
|
43
43
|
for (uint64_t i = 0; i < n; ++i) {
|
|
44
|
-
sk.update(i, 1.0);
|
|
44
|
+
sk.update(static_cast<int>(i), 1.0);
|
|
45
45
|
}
|
|
46
46
|
return sk;
|
|
47
47
|
}
|
|
@@ -147,7 +147,7 @@ TEST_CASE("varopt union: bad serialization version", "[var_opt_union]") {
|
|
|
147
147
|
|
|
148
148
|
TEST_CASE("varopt union: invalid k", "[var_opt_union]") {
|
|
149
149
|
REQUIRE_THROWS_AS(var_opt_union<int>(0), std::invalid_argument);
|
|
150
|
-
REQUIRE_THROWS_AS(var_opt_union<int>(
|
|
150
|
+
REQUIRE_THROWS_AS(var_opt_union<int>(1U << 31), std::invalid_argument);
|
|
151
151
|
}
|
|
152
152
|
|
|
153
153
|
TEST_CASE("varopt union: bad family", "[var_opt_union]") {
|
|
@@ -179,13 +179,13 @@ TEST_CASE("varopt union: empty union", "[var_opt_union]") {
|
|
|
179
179
|
}
|
|
180
180
|
|
|
181
181
|
TEST_CASE("varopt union: two exact sketches", "[var_opt_union]") {
|
|
182
|
-
|
|
182
|
+
int n = 4; // 2n < k
|
|
183
183
|
uint32_t k = 10;
|
|
184
184
|
var_opt_sketch<int> sk1(k), sk2(k);
|
|
185
185
|
|
|
186
|
-
for (
|
|
187
|
-
sk1.update(i, i);
|
|
188
|
-
sk2.update(static_cast<
|
|
186
|
+
for (int i = 1; i <= n; ++i) {
|
|
187
|
+
sk1.update(i, static_cast<double>(i));
|
|
188
|
+
sk2.update(-i, static_cast<double>(i));
|
|
189
189
|
}
|
|
190
190
|
|
|
191
191
|
var_opt_union<int> u(k);
|
|
@@ -193,7 +193,7 @@ TEST_CASE("varopt union: two exact sketches", "[var_opt_union]") {
|
|
|
193
193
|
u.update(sk2);
|
|
194
194
|
|
|
195
195
|
var_opt_sketch<int> result = u.get_result();
|
|
196
|
-
REQUIRE(result.get_n() ==
|
|
196
|
+
REQUIRE(result.get_n() == 2ULL * n);
|
|
197
197
|
REQUIRE(result.get_k() == k);
|
|
198
198
|
}
|
|
199
199
|
|
|
@@ -204,13 +204,13 @@ TEST_CASE("varopt union: heavy sampling sketch", "[var_opt_union]") {
|
|
|
204
204
|
uint32_t k2 = 5;
|
|
205
205
|
var_opt_sketch<int64_t> sk1(k1), sk2(k2);
|
|
206
206
|
for (uint64_t i = 1; i <= n1; ++i) {
|
|
207
|
-
sk1.update(i, i);
|
|
207
|
+
sk1.update(i, static_cast<double>(i));
|
|
208
208
|
}
|
|
209
209
|
|
|
210
210
|
for (uint64_t i = 1; i < n2; ++i) { // we'll add a very heavy one later
|
|
211
|
-
sk2.update(static_cast<int64_t>(
|
|
211
|
+
sk2.update(-1 * static_cast<int64_t>(i), i + 1000.0);
|
|
212
212
|
}
|
|
213
|
-
sk2.update(-n2, 1000000.0);
|
|
213
|
+
sk2.update(-1 * static_cast<int64_t>(n2), 1000000.0);
|
|
214
214
|
|
|
215
215
|
var_opt_union<int64_t> u(k1);
|
|
216
216
|
u.update(sk1);
|
|
@@ -258,15 +258,15 @@ TEST_CASE("varopt union: small sampling sketch", "[var_opt_union]") {
|
|
|
258
258
|
uint64_t n2 = 64;
|
|
259
259
|
|
|
260
260
|
var_opt_sketch<float> sk(k_small);
|
|
261
|
-
for (uint64_t i = 0; i < n1; ++i) { sk.update(i); }
|
|
262
|
-
sk.update(-1, n1 * n1); // add a heavy item
|
|
261
|
+
for (uint64_t i = 0; i < n1; ++i) { sk.update(static_cast<float>(i)); }
|
|
262
|
+
sk.update(-1.0f, static_cast<double>(n1 * n1)); // add a heavy item
|
|
263
263
|
|
|
264
264
|
var_opt_union<float> u(k_max);
|
|
265
265
|
u.update(sk);
|
|
266
266
|
|
|
267
267
|
// another one, but different n to get a different per-item weight
|
|
268
268
|
var_opt_sketch<float> sk2(k_small);
|
|
269
|
-
for (uint64_t i = 0; i < n2; ++i) { sk2.update(i); }
|
|
269
|
+
for (uint64_t i = 0; i < n2; ++i) { sk2.update(static_cast<float>(i)); }
|
|
270
270
|
u.update(sk2);
|
|
271
271
|
|
|
272
272
|
// should trigger migrate_marked_items_by_decreasing_k()
|
|
@@ -49,8 +49,9 @@ class CMakeBuild(build_ext):
|
|
|
49
49
|
os.path.dirname(self.get_ext_fullpath(ext.name)))
|
|
50
50
|
cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir]
|
|
51
51
|
cmake_args += ['-DWITH_PYTHON=True']
|
|
52
|
+
cmake_args += ['-DCMAKE_CXX_STANDARD=11']
|
|
52
53
|
# ensure we use a consistent python version
|
|
53
|
-
cmake_args += ['-
|
|
54
|
+
cmake_args += ['-DPython3_EXECUTABLE=' + sys.executable]
|
|
54
55
|
cfg = 'Debug' if self.debug else 'Release'
|
|
55
56
|
build_args = ['--config', cfg]
|
|
56
57
|
|
|
@@ -59,7 +60,8 @@ class CMakeBuild(build_ext):
|
|
|
59
60
|
cfg.upper(),
|
|
60
61
|
extdir)]
|
|
61
62
|
if sys.maxsize > 2**32:
|
|
62
|
-
cmake_args += ['-
|
|
63
|
+
cmake_args += ['-T', 'host=x64']
|
|
64
|
+
cmake_args += ['-DCMAKE_GENERATOR_PLATFORM=x64']
|
|
63
65
|
build_args += ['--', '/m']
|
|
64
66
|
else:
|
|
65
67
|
cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
|
|
@@ -74,23 +76,24 @@ class CMakeBuild(build_ext):
|
|
|
74
76
|
subprocess.check_call(['cmake', ext.sourcedir] + cmake_args,
|
|
75
77
|
cwd=self.build_temp, env=env)
|
|
76
78
|
subprocess.check_call(['cmake', '--build', '.', '--target', 'python'] + build_args,
|
|
77
|
-
cwd=self.build_temp)
|
|
79
|
+
cwd=self.build_temp, env=env)
|
|
78
80
|
print() # add an empty line to pretty print
|
|
79
81
|
|
|
80
82
|
setup(
|
|
81
83
|
name='datasketches',
|
|
82
|
-
version='3.
|
|
83
|
-
author='Apache
|
|
84
|
+
version='3.3.0',
|
|
85
|
+
author='Apache Software Foundation',
|
|
84
86
|
author_email='dev@datasketches.apache.org',
|
|
85
|
-
description='
|
|
87
|
+
description='The Apache DataSketches Library for Python',
|
|
86
88
|
license='Apache License 2.0',
|
|
87
89
|
url='http://datasketches.apache.org',
|
|
88
90
|
long_description=open('python/README.md').read(),
|
|
91
|
+
long_description_content_type='text/markdown',
|
|
89
92
|
packages=find_packages('python'), # python pacakges only in this dir
|
|
90
93
|
package_dir={'':'python'},
|
|
91
94
|
# may need to add all source paths for sdist packages w/o MANIFEST.in
|
|
92
95
|
ext_modules=[CMakeExtension('datasketches')],
|
|
93
96
|
cmdclass={'build_ext': CMakeBuild},
|
|
94
|
-
|
|
97
|
+
install_requires=['numpy'],
|
|
95
98
|
zip_safe=False
|
|
96
99
|
)
|
|
@@ -32,53 +32,34 @@ target_include_directories(theta
|
|
|
32
32
|
target_link_libraries(theta INTERFACE common)
|
|
33
33
|
target_compile_features(theta INTERFACE cxx_std_11)
|
|
34
34
|
|
|
35
|
-
set(theta_HEADERS "")
|
|
36
|
-
list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/theta_sketch_impl.hpp")
|
|
37
|
-
list(APPEND theta_HEADERS "include/theta_union.hpp;include/theta_union_impl.hpp")
|
|
38
|
-
list(APPEND theta_HEADERS "include/theta_intersection.hpp;include/theta_intersection_impl.hpp")
|
|
39
|
-
list(APPEND theta_HEADERS "include/theta_a_not_b.hpp;include/theta_a_not_b_impl.hpp")
|
|
40
|
-
list(APPEND theta_HEADERS "include/theta_jaccard_similarity.hpp")
|
|
41
|
-
list(APPEND theta_HEADERS "include/theta_comparators.hpp")
|
|
42
|
-
list(APPEND theta_HEADERS "include/theta_constants.hpp")
|
|
43
|
-
list(APPEND theta_HEADERS "include/theta_helpers.hpp")
|
|
44
|
-
list(APPEND theta_HEADERS "include/theta_update_sketch_base.hpp;include/theta_update_sketch_base_impl.hpp")
|
|
45
|
-
list(APPEND theta_HEADERS "include/theta_union_base.hpp;include/theta_union_base_impl.hpp")
|
|
46
|
-
list(APPEND theta_HEADERS "include/theta_intersection_base.hpp;include/theta_intersection_base_impl.hpp")
|
|
47
|
-
list(APPEND theta_HEADERS "include/theta_set_difference_base.hpp;include/theta_set_difference_base_impl.hpp")
|
|
48
|
-
list(APPEND theta_HEADERS "include/theta_jaccard_similarity_base.hpp")
|
|
49
|
-
list(APPEND theta_HEADERS "include/bounds_on_ratios_in_sampled_sets.hpp")
|
|
50
|
-
list(APPEND theta_HEADERS "include/bounds_on_ratios_in_theta_sketched_sets.hpp")
|
|
51
|
-
|
|
52
35
|
install(TARGETS theta
|
|
53
36
|
EXPORT ${PROJECT_NAME}
|
|
54
37
|
)
|
|
55
38
|
|
|
56
|
-
install(FILES
|
|
39
|
+
install(FILES
|
|
40
|
+
include/theta_sketch.hpp
|
|
41
|
+
include/theta_sketch_impl.hpp
|
|
42
|
+
include/theta_union.hpp
|
|
43
|
+
include/theta_union_impl.hpp
|
|
44
|
+
include/theta_intersection.hpp
|
|
45
|
+
include/theta_intersection_impl.hpp
|
|
46
|
+
include/theta_a_not_b.hpp
|
|
47
|
+
include/theta_a_not_b_impl.hpp
|
|
48
|
+
include/theta_jaccard_similarity.hpp
|
|
49
|
+
include/theta_comparators.hpp
|
|
50
|
+
include/theta_constants.hpp
|
|
51
|
+
include/theta_helpers.hpp
|
|
52
|
+
include/theta_update_sketch_base.hpp
|
|
53
|
+
include/theta_update_sketch_base_impl.hpp
|
|
54
|
+
include/theta_union_base.hpp
|
|
55
|
+
include/theta_union_base_impl.hpp
|
|
56
|
+
include/theta_intersection_base.hpp
|
|
57
|
+
include/theta_intersection_base_impl.hpp
|
|
58
|
+
include/theta_set_difference_base.hpp
|
|
59
|
+
include/theta_set_difference_base_impl.hpp
|
|
60
|
+
include/theta_jaccard_similarity_base.hpp
|
|
61
|
+
include/bounds_on_ratios_in_sampled_sets.hpp
|
|
62
|
+
include/bounds_on_ratios_in_theta_sketched_sets.hpp
|
|
63
|
+
include/compact_theta_sketch_parser.hpp
|
|
64
|
+
include/compact_theta_sketch_parser_impl.hpp
|
|
57
65
|
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
|
|
58
|
-
|
|
59
|
-
target_sources(theta
|
|
60
|
-
INTERFACE
|
|
61
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch.hpp
|
|
62
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union.hpp
|
|
63
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection.hpp
|
|
64
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b.hpp
|
|
65
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch_impl.hpp
|
|
66
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_impl.hpp
|
|
67
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_impl.hpp
|
|
68
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b_impl.hpp
|
|
69
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity.hpp
|
|
70
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_comparators.hpp
|
|
71
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_constants.hpp
|
|
72
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_helpers.hpp
|
|
73
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base.hpp
|
|
74
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base_impl.hpp
|
|
75
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base.hpp
|
|
76
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base_impl.hpp
|
|
77
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base.hpp
|
|
78
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base_impl.hpp
|
|
79
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base.hpp
|
|
80
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base_impl.hpp
|
|
81
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity_base.hpp
|
|
82
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_sampled_sets.hpp
|
|
83
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_theta_sketched_sets.hpp
|
|
84
|
-
)
|
|
@@ -90,7 +90,7 @@ public:
|
|
|
90
90
|
* @param f the inclusion probability used to produce the set with size <i>a</i>.
|
|
91
91
|
* @return the approximate lower bound
|
|
92
92
|
*/
|
|
93
|
-
static double estimate_of_a(uint64_t a,
|
|
93
|
+
static double estimate_of_a(uint64_t a, double f) {
|
|
94
94
|
check_inputs(a, 1, f);
|
|
95
95
|
return a / f;
|
|
96
96
|
}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef COMPACT_THETA_SKETCH_PARSER_HPP_
|
|
21
|
+
#define COMPACT_THETA_SKETCH_PARSER_HPP_
|
|
22
|
+
|
|
23
|
+
#include <stdint.h>
|
|
24
|
+
|
|
25
|
+
namespace datasketches {
|
|
26
|
+
|
|
27
|
+
template<bool dummy>
|
|
28
|
+
class compact_theta_sketch_parser {
|
|
29
|
+
public:
|
|
30
|
+
struct compact_theta_sketch_data {
|
|
31
|
+
bool is_empty;
|
|
32
|
+
bool is_ordered;
|
|
33
|
+
uint16_t seed_hash;
|
|
34
|
+
uint32_t num_entries;
|
|
35
|
+
uint64_t theta;
|
|
36
|
+
const uint64_t* entries;
|
|
37
|
+
};
|
|
38
|
+
|
|
39
|
+
static compact_theta_sketch_data parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error = false);
|
|
40
|
+
|
|
41
|
+
private:
|
|
42
|
+
// offsets are in sizeof(type)
|
|
43
|
+
static const size_t COMPACT_SKETCH_PRE_LONGS_BYTE = 0;
|
|
44
|
+
static const size_t COMPACT_SKETCH_SERIAL_VERSION_BYTE = 1;
|
|
45
|
+
static const size_t COMPACT_SKETCH_TYPE_BYTE = 2;
|
|
46
|
+
static const size_t COMPACT_SKETCH_FLAGS_BYTE = 5;
|
|
47
|
+
static const size_t COMPACT_SKETCH_SEED_HASH_U16 = 3;
|
|
48
|
+
static const size_t COMPACT_SKETCH_NUM_ENTRIES_U32 = 2;
|
|
49
|
+
static const size_t COMPACT_SKETCH_SINGLE_ENTRY_U64 = 1;
|
|
50
|
+
static const size_t COMPACT_SKETCH_ENTRIES_EXACT_U64 = 2;
|
|
51
|
+
static const size_t COMPACT_SKETCH_THETA_U64 = 2;
|
|
52
|
+
static const size_t COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 = 3;
|
|
53
|
+
|
|
54
|
+
static const uint8_t COMPACT_SKETCH_IS_EMPTY_FLAG = 2;
|
|
55
|
+
static const uint8_t COMPACT_SKETCH_IS_ORDERED_FLAG = 4;
|
|
56
|
+
|
|
57
|
+
static const uint8_t COMPACT_SKETCH_SERIAL_VERSION = 3;
|
|
58
|
+
static const uint8_t COMPACT_SKETCH_TYPE = 3;
|
|
59
|
+
|
|
60
|
+
static std::string hex_dump(const uint8_t* ptr, size_t size);
|
|
61
|
+
};
|
|
62
|
+
|
|
63
|
+
} /* namespace datasketches */
|
|
64
|
+
|
|
65
|
+
#include "compact_theta_sketch_parser_impl.hpp"
|
|
66
|
+
|
|
67
|
+
#endif
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef COMPACT_THETA_SKETCH_PARSER_IMPL_HPP_
|
|
21
|
+
#define COMPACT_THETA_SKETCH_PARSER_IMPL_HPP_
|
|
22
|
+
|
|
23
|
+
#include <iostream>
|
|
24
|
+
#include <iomanip>
|
|
25
|
+
|
|
26
|
+
namespace datasketches {
|
|
27
|
+
|
|
28
|
+
template<bool dummy>
|
|
29
|
+
auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data {
|
|
30
|
+
if (size < 8) throw std::invalid_argument("at least 8 bytes expected, actual " + std::to_string(size)
|
|
31
|
+
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
|
32
|
+
|
|
33
|
+
uint8_t serial_version = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE];
|
|
34
|
+
|
|
35
|
+
switch(serial_version) {
|
|
36
|
+
case COMPACT_SKETCH_SERIAL_VERSION: {
|
|
37
|
+
checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
|
|
38
|
+
uint64_t theta = theta_constants::MAX_THETA;
|
|
39
|
+
const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
|
|
40
|
+
if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_EMPTY_FLAG)) {
|
|
41
|
+
return {true, true, seed_hash, 0, theta, nullptr};
|
|
42
|
+
}
|
|
43
|
+
checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
|
44
|
+
const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
|
|
45
|
+
if (has_theta) {
|
|
46
|
+
if (size < 16) throw std::invalid_argument("at least 16 bytes expected, actual " + std::to_string(size));
|
|
47
|
+
theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
|
|
48
|
+
}
|
|
49
|
+
if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] == 1) {
|
|
50
|
+
return {false, true, seed_hash, 1, theta, reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
|
|
51
|
+
}
|
|
52
|
+
const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
|
|
53
|
+
const size_t entries_start_u64 = has_theta ? COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 : COMPACT_SKETCH_ENTRIES_EXACT_U64;
|
|
54
|
+
const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + entries_start_u64;
|
|
55
|
+
const size_t expected_size_bytes = (entries_start_u64 + num_entries) * sizeof(uint64_t);
|
|
56
|
+
if (size < expected_size_bytes) {
|
|
57
|
+
throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
|
|
58
|
+
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
|
59
|
+
}
|
|
60
|
+
const bool is_ordered = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_ORDERED_FLAG);
|
|
61
|
+
return {false, is_ordered, seed_hash, num_entries, theta, entries};
|
|
62
|
+
}
|
|
63
|
+
case 1: {
|
|
64
|
+
uint16_t seed_hash = compute_seed_hash(seed);
|
|
65
|
+
checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
|
|
66
|
+
const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
|
|
67
|
+
uint64_t theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
|
|
68
|
+
bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
|
|
69
|
+
if (is_empty) {
|
|
70
|
+
return {true, true, seed_hash, 0, theta, nullptr};
|
|
71
|
+
}
|
|
72
|
+
const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
|
|
73
|
+
const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
|
|
74
|
+
if (size < expected_size_bytes) {
|
|
75
|
+
throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
|
|
76
|
+
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
|
77
|
+
}
|
|
78
|
+
return {false, true, seed_hash, num_entries, theta, entries};
|
|
79
|
+
}
|
|
80
|
+
case 2: {
|
|
81
|
+
uint8_t preamble_size = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE];
|
|
82
|
+
checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
|
|
83
|
+
const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
|
|
84
|
+
checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
|
85
|
+
if (preamble_size == 1) {
|
|
86
|
+
return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr};
|
|
87
|
+
} else if (preamble_size == 2) {
|
|
88
|
+
const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
|
|
89
|
+
if (num_entries == 0) {
|
|
90
|
+
return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr};
|
|
91
|
+
} else {
|
|
92
|
+
const size_t expected_size_bytes = (preamble_size + num_entries) << 3;
|
|
93
|
+
if (size < expected_size_bytes) {
|
|
94
|
+
throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
|
|
95
|
+
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
|
96
|
+
}
|
|
97
|
+
const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_EXACT_U64;
|
|
98
|
+
return {false, true, seed_hash, num_entries, theta_constants::MAX_THETA, entries};
|
|
99
|
+
}
|
|
100
|
+
} else if (preamble_size == 3) {
|
|
101
|
+
const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
|
|
102
|
+
uint64_t theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
|
|
103
|
+
bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
|
|
104
|
+
if (is_empty) {
|
|
105
|
+
return {true, true, seed_hash, 0, theta, nullptr};
|
|
106
|
+
}
|
|
107
|
+
const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
|
|
108
|
+
const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
|
|
109
|
+
if (size < expected_size_bytes) {
|
|
110
|
+
throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
|
|
111
|
+
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
|
112
|
+
}
|
|
113
|
+
return {false, true, seed_hash, num_entries, theta, entries};
|
|
114
|
+
} else {
|
|
115
|
+
throw std::invalid_argument(std::to_string(preamble_size) + " longs of premable, but expected 1, 2, or 3");
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
default:
|
|
119
|
+
// this should always fail since the valid cases are handled above
|
|
120
|
+
checker<true>::check_serial_version(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE], COMPACT_SKETCH_SERIAL_VERSION);
|
|
121
|
+
// this throw is never reached, because check_serial_version will throw an informative exception.
|
|
122
|
+
// This is only here to avoid a compiler warning about a path without a return value.
|
|
123
|
+
throw std::invalid_argument("unexpected sketch serialization version");
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
template<bool dummy>
|
|
128
|
+
std::string compact_theta_sketch_parser<dummy>::hex_dump(const uint8_t* ptr, size_t size) {
|
|
129
|
+
std::stringstream s;
|
|
130
|
+
s << std::hex << std::setfill('0') << std::uppercase;
|
|
131
|
+
for (size_t i = 0; i < size; ++i) s << std::setw(2) << (ptr[i] & 0xff);
|
|
132
|
+
return s.str();
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
} /* namespace datasketches */
|
|
136
|
+
|
|
137
|
+
#endif
|
|
@@ -21,14 +21,19 @@
|
|
|
21
21
|
#define THETA_CONSTANTS_HPP_
|
|
22
22
|
|
|
23
23
|
#include <climits>
|
|
24
|
+
#include "common_defs.hpp"
|
|
24
25
|
|
|
25
26
|
namespace datasketches {
|
|
26
27
|
|
|
27
28
|
namespace theta_constants {
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
29
|
+
using resize_factor = datasketches::resize_factor;
|
|
30
|
+
//enum resize_factor { X1, X2, X4, X8 };
|
|
31
|
+
const uint64_t MAX_THETA = LLONG_MAX; // signed max for compatibility with Java
|
|
32
|
+
const uint8_t MIN_LG_K = 5;
|
|
33
|
+
const uint8_t MAX_LG_K = 26;
|
|
34
|
+
|
|
35
|
+
const uint8_t DEFAULT_LG_K = 12;
|
|
36
|
+
const resize_factor DEFAULT_RESIZE_FACTOR = resize_factor::X8;
|
|
32
37
|
}
|
|
33
38
|
|
|
34
39
|
} /* namespace datasketches */
|