datasketches 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
- metadata +8 -3
|
@@ -35,13 +35,13 @@ public:
|
|
|
35
35
|
using CompactSketch = compact_theta_sketch_alloc<Allocator>;
|
|
36
36
|
using resize_factor = theta_constants::resize_factor;
|
|
37
37
|
|
|
38
|
-
struct
|
|
39
|
-
|
|
38
|
+
struct nop_policy {
|
|
39
|
+
void operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
|
|
40
|
+
unused(internal_entry);
|
|
40
41
|
unused(incoming_entry);
|
|
41
|
-
return internal_entry;
|
|
42
42
|
}
|
|
43
43
|
};
|
|
44
|
-
using State = theta_union_base<Entry, ExtractKey,
|
|
44
|
+
using State = theta_union_base<Entry, ExtractKey, nop_policy, Sketch, CompactSketch, Allocator>;
|
|
45
45
|
|
|
46
46
|
// No constructor here. Use builder instead.
|
|
47
47
|
class builder;
|
|
@@ -43,7 +43,7 @@ void theta_union_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
|
|
|
43
43
|
if (sketch.get_theta64() < union_theta_) union_theta_ = sketch.get_theta64();
|
|
44
44
|
for (auto& entry: sketch) {
|
|
45
45
|
const uint64_t hash = EK()(entry);
|
|
46
|
-
if (hash < union_theta_) {
|
|
46
|
+
if (hash < union_theta_ && hash < table_.theta_) {
|
|
47
47
|
auto result = table_.find(hash);
|
|
48
48
|
if (!result.second) {
|
|
49
49
|
table_.insert(result.first, conditional_forward<SS>(entry));
|
|
@@ -24,7 +24,7 @@ namespace datasketches {
|
|
|
24
24
|
|
|
25
25
|
template<typename A>
|
|
26
26
|
theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator):
|
|
27
|
-
state_(lg_cur_size, lg_nom_size, rf, theta, seed,
|
|
27
|
+
state_(lg_cur_size, lg_nom_size, rf, theta, seed, nop_policy(), allocator)
|
|
28
28
|
{}
|
|
29
29
|
|
|
30
30
|
template<typename A>
|
|
@@ -53,6 +53,8 @@ struct theta_update_sketch_base {
|
|
|
53
53
|
inline uint64_t hash_and_screen(const void* data, size_t length);
|
|
54
54
|
|
|
55
55
|
inline std::pair<iterator, bool> find(uint64_t key) const;
|
|
56
|
+
static inline std::pair<iterator, bool> find(Entry* entries, uint8_t lg_size, uint64_t key);
|
|
57
|
+
|
|
56
58
|
|
|
57
59
|
template<typename FwdEntry>
|
|
58
60
|
inline void insert(iterator it, FwdEntry&& entry);
|
|
@@ -39,7 +39,7 @@ seed_(seed),
|
|
|
39
39
|
entries_(nullptr)
|
|
40
40
|
{
|
|
41
41
|
if (lg_cur_size > 0) {
|
|
42
|
-
const size_t size =
|
|
42
|
+
const size_t size = 1ULL << lg_cur_size;
|
|
43
43
|
entries_ = allocator_.allocate(size);
|
|
44
44
|
for (size_t i = 0; i < size; ++i) EK()(entries_[i]) = 0;
|
|
45
45
|
}
|
|
@@ -58,7 +58,7 @@ seed_(other.seed_),
|
|
|
58
58
|
entries_(nullptr)
|
|
59
59
|
{
|
|
60
60
|
if (other.entries_ != nullptr) {
|
|
61
|
-
const size_t size =
|
|
61
|
+
const size_t size = 1ULL << lg_cur_size_;
|
|
62
62
|
entries_ = allocator_.allocate(size);
|
|
63
63
|
for (size_t i = 0; i < size; ++i) {
|
|
64
64
|
if (EK()(other.entries_[i]) != 0) {
|
|
@@ -89,7 +89,7 @@ template<typename EN, typename EK, typename A>
|
|
|
89
89
|
theta_update_sketch_base<EN, EK, A>::~theta_update_sketch_base()
|
|
90
90
|
{
|
|
91
91
|
if (entries_ != nullptr) {
|
|
92
|
-
const size_t size =
|
|
92
|
+
const size_t size = 1ULL << lg_cur_size_;
|
|
93
93
|
for (size_t i = 0; i < size; ++i) {
|
|
94
94
|
if (EK()(entries_[i]) != 0) entries_[i].~EN();
|
|
95
95
|
}
|
|
@@ -136,18 +136,23 @@ uint64_t theta_update_sketch_base<EN, EK, A>::hash_and_screen(const void* data,
|
|
|
136
136
|
|
|
137
137
|
template<typename EN, typename EK, typename A>
|
|
138
138
|
auto theta_update_sketch_base<EN, EK, A>::find(uint64_t key) const -> std::pair<iterator, bool> {
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
139
|
+
return find(entries_, lg_cur_size_, key);
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
template<typename EN, typename EK, typename A>
|
|
143
|
+
auto theta_update_sketch_base<EN, EK, A>::find(EN* entries, uint8_t lg_size, uint64_t key) -> std::pair<iterator, bool> {
|
|
144
|
+
const uint32_t size = 1 << lg_size;
|
|
145
|
+
const uint32_t mask = size - 1;
|
|
146
|
+
const uint32_t stride = get_stride(key, lg_size);
|
|
142
147
|
uint32_t index = static_cast<uint32_t>(key) & mask;
|
|
143
148
|
// search for duplicate or zero
|
|
144
149
|
const uint32_t loop_index = index;
|
|
145
150
|
do {
|
|
146
|
-
const uint64_t probe = EK()(
|
|
151
|
+
const uint64_t probe = EK()(entries[index]);
|
|
147
152
|
if (probe == 0) {
|
|
148
|
-
return std::pair<iterator, bool>(&
|
|
153
|
+
return std::pair<iterator, bool>(&entries[index], false);
|
|
149
154
|
} else if (probe == key) {
|
|
150
|
-
return std::pair<iterator, bool>(&
|
|
155
|
+
return std::pair<iterator, bool>(&entries[index], true);
|
|
151
156
|
}
|
|
152
157
|
index = (index + stride) & mask;
|
|
153
158
|
} while (index != loop_index);
|
|
@@ -175,13 +180,13 @@ auto theta_update_sketch_base<EN, EK, A>::begin() const -> iterator {
|
|
|
175
180
|
|
|
176
181
|
template<typename EN, typename EK, typename A>
|
|
177
182
|
auto theta_update_sketch_base<EN, EK, A>::end() const -> iterator {
|
|
178
|
-
return &entries_[
|
|
183
|
+
return &entries_[1ULL << lg_cur_size_];
|
|
179
184
|
}
|
|
180
185
|
|
|
181
186
|
template<typename EN, typename EK, typename A>
|
|
182
187
|
uint32_t theta_update_sketch_base<EN, EK, A>::get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size) {
|
|
183
188
|
const double fraction = (lg_cur_size <= lg_nom_size) ? RESIZE_THRESHOLD : REBUILD_THRESHOLD;
|
|
184
|
-
return std::floor(fraction * (1 << lg_cur_size));
|
|
189
|
+
return static_cast<uint32_t>(std::floor(fraction * (1 << lg_cur_size)));
|
|
185
190
|
}
|
|
186
191
|
|
|
187
192
|
template<typename EN, typename EK, typename A>
|
|
@@ -192,29 +197,29 @@ uint32_t theta_update_sketch_base<EN, EK, A>::get_stride(uint64_t key, uint8_t l
|
|
|
192
197
|
|
|
193
198
|
template<typename EN, typename EK, typename A>
|
|
194
199
|
void theta_update_sketch_base<EN, EK, A>::resize() {
|
|
195
|
-
const size_t old_size =
|
|
196
|
-
const uint8_t
|
|
197
|
-
const
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
EN* old_entries = entries_;
|
|
201
|
-
entries_ = allocator_.allocate(new_size);
|
|
202
|
-
for (size_t i = 0; i < new_size; ++i) EK()(entries_[i]) = 0;
|
|
203
|
-
num_entries_ = 0;
|
|
200
|
+
const size_t old_size = 1ULL << lg_cur_size_;
|
|
201
|
+
const uint8_t lg_new_size = std::min<uint8_t>(lg_cur_size_ + static_cast<uint8_t>(rf_), lg_nom_size_ + 1);
|
|
202
|
+
const size_t new_size = 1ULL << lg_new_size;
|
|
203
|
+
EN* new_entries = allocator_.allocate(new_size);
|
|
204
|
+
for (size_t i = 0; i < new_size; ++i) EK()(new_entries[i]) = 0;
|
|
204
205
|
for (size_t i = 0; i < old_size; ++i) {
|
|
205
|
-
const uint64_t key = EK()(
|
|
206
|
+
const uint64_t key = EK()(entries_[i]);
|
|
206
207
|
if (key != 0) {
|
|
207
|
-
|
|
208
|
-
|
|
208
|
+
// always finds an empty slot in a larger table
|
|
209
|
+
new (find(new_entries, lg_new_size, key).first) EN(std::move(entries_[i]));
|
|
210
|
+
entries_[i].~EN();
|
|
211
|
+
EK()(entries_[i]) = 0;
|
|
209
212
|
}
|
|
210
213
|
}
|
|
211
|
-
|
|
214
|
+
std::swap(entries_, new_entries);
|
|
215
|
+
lg_cur_size_ = lg_new_size;
|
|
216
|
+
allocator_.deallocate(new_entries, old_size);
|
|
212
217
|
}
|
|
213
218
|
|
|
214
219
|
// assumes number of entries > nominal size
|
|
215
220
|
template<typename EN, typename EK, typename A>
|
|
216
221
|
void theta_update_sketch_base<EN, EK, A>::rebuild() {
|
|
217
|
-
const size_t size =
|
|
222
|
+
const size_t size = 1ULL << lg_cur_size_;
|
|
218
223
|
const uint32_t nominal_size = 1 << lg_nom_size_;
|
|
219
224
|
|
|
220
225
|
// empty entries have uninitialized payloads
|
|
@@ -227,10 +232,10 @@ void theta_update_sketch_base<EN, EK, A>::rebuild() {
|
|
|
227
232
|
const size_t num_old_entries = num_entries_;
|
|
228
233
|
entries_ = allocator_.allocate(size);
|
|
229
234
|
for (size_t i = 0; i < size; ++i) EK()(entries_[i]) = 0;
|
|
230
|
-
num_entries_ =
|
|
235
|
+
num_entries_ = nominal_size;
|
|
231
236
|
// relies on consolidating non-empty entries to the front
|
|
232
237
|
for (size_t i = 0; i < nominal_size; ++i) {
|
|
233
|
-
|
|
238
|
+
new (find(EK()(old_entries[i])).first) EN(std::move(old_entries[i]));
|
|
234
239
|
old_entries[i].~EN();
|
|
235
240
|
}
|
|
236
241
|
for (size_t i = nominal_size; i < num_old_entries; ++i) old_entries[i].~EN();
|
|
@@ -301,7 +306,7 @@ Derived& theta_base_builder<Derived, Allocator>::set_seed(uint64_t seed) {
|
|
|
301
306
|
|
|
302
307
|
template<typename Derived, typename Allocator>
|
|
303
308
|
uint64_t theta_base_builder<Derived, Allocator>::starting_theta() const {
|
|
304
|
-
if (p_ < 1) return theta_constants::MAX_THETA * p_;
|
|
309
|
+
if (p_ < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p_);
|
|
305
310
|
return theta_constants::MAX_THETA;
|
|
306
311
|
}
|
|
307
312
|
|
|
@@ -37,7 +37,7 @@ TEST_CASE("theta a-not-b: empty", "[theta_a_not_b]") {
|
|
|
37
37
|
TEST_CASE("theta a-not-b: non empty no retained keys", "[theta_a_not_b]") {
|
|
38
38
|
update_theta_sketch a = update_theta_sketch::builder().build();
|
|
39
39
|
a.update(1);
|
|
40
|
-
update_theta_sketch b = update_theta_sketch::builder().set_p(0.
|
|
40
|
+
update_theta_sketch b = update_theta_sketch::builder().set_p(0.001f).build();
|
|
41
41
|
theta_a_not_b a_not_b;
|
|
42
42
|
|
|
43
43
|
// B is still empty
|
|
@@ -167,6 +167,28 @@ TEST_CASE("theta a-not-b: estimation mode half overlap", "[theta_a_not_b]") {
|
|
|
167
167
|
REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
|
|
168
168
|
}
|
|
169
169
|
|
|
170
|
+
TEST_CASE("theta a-not-b: estimation mode half overlap wrapped compact", "[theta_a_not_b]") {
|
|
171
|
+
update_theta_sketch a = update_theta_sketch::builder().build();
|
|
172
|
+
int value = 0;
|
|
173
|
+
for (int i = 0; i < 10000; i++) a.update(value++);
|
|
174
|
+
auto bytes_a = a.compact().serialize();
|
|
175
|
+
|
|
176
|
+
update_theta_sketch b = update_theta_sketch::builder().build();
|
|
177
|
+
value = 5000;
|
|
178
|
+
for (int i = 0; i < 10000; i++) b.update(value++);
|
|
179
|
+
auto bytes_b = b.compact().serialize();
|
|
180
|
+
|
|
181
|
+
theta_a_not_b a_not_b;
|
|
182
|
+
|
|
183
|
+
auto result = a_not_b.compute(
|
|
184
|
+
wrapped_compact_theta_sketch::wrap(bytes_a.data(), bytes_a.size()),
|
|
185
|
+
wrapped_compact_theta_sketch::wrap(bytes_b.data(), bytes_b.size())
|
|
186
|
+
);
|
|
187
|
+
REQUIRE_FALSE(result.is_empty());
|
|
188
|
+
REQUIRE(result.is_estimation_mode());
|
|
189
|
+
REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
|
|
190
|
+
}
|
|
191
|
+
|
|
170
192
|
TEST_CASE("theta a-not-b: estimation mode disjoint", "[theta_a_not_b]") {
|
|
171
193
|
update_theta_sketch a = update_theta_sketch::builder().build();
|
|
172
194
|
int value = 0;
|
|
@@ -48,7 +48,7 @@ TEST_CASE("theta intersection: empty", "[theta_intersection]") {
|
|
|
48
48
|
}
|
|
49
49
|
|
|
50
50
|
TEST_CASE("theta intersection: non empty no retained keys", "[theta_intersection]") {
|
|
51
|
-
update_theta_sketch sketch = update_theta_sketch::builder().set_p(0.
|
|
51
|
+
update_theta_sketch sketch = update_theta_sketch::builder().set_p(0.001f).build();
|
|
52
52
|
sketch.update(1);
|
|
53
53
|
theta_intersection intersection;
|
|
54
54
|
intersection.update(sketch);
|
|
@@ -174,6 +174,26 @@ TEST_CASE("theta intersection: estimation mode half overlap ordered", "[theta_in
|
|
|
174
174
|
REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
|
|
175
175
|
}
|
|
176
176
|
|
|
177
|
+
TEST_CASE("theta intersection: estimation mode half overlap ordered wrapped compact", "[theta_intersection]") {
|
|
178
|
+
update_theta_sketch sketch1 = update_theta_sketch::builder().build();
|
|
179
|
+
int value = 0;
|
|
180
|
+
for (int i = 0; i < 10000; i++) sketch1.update(value++);
|
|
181
|
+
auto bytes1 = sketch1.compact().serialize();
|
|
182
|
+
|
|
183
|
+
update_theta_sketch sketch2 = update_theta_sketch::builder().build();
|
|
184
|
+
value = 5000;
|
|
185
|
+
for (int i = 0; i < 10000; i++) sketch2.update(value++);
|
|
186
|
+
auto bytes2 = sketch2.compact().serialize();
|
|
187
|
+
|
|
188
|
+
theta_intersection intersection;
|
|
189
|
+
intersection.update(wrapped_compact_theta_sketch::wrap(bytes1.data(), bytes1.size()));
|
|
190
|
+
intersection.update(wrapped_compact_theta_sketch::wrap(bytes2.data(), bytes2.size()));
|
|
191
|
+
compact_theta_sketch result = intersection.get_result();
|
|
192
|
+
REQUIRE_FALSE(result.is_empty());
|
|
193
|
+
REQUIRE(result.is_estimation_mode());
|
|
194
|
+
REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
|
|
195
|
+
}
|
|
196
|
+
|
|
177
197
|
TEST_CASE("theta intersection: estimation mode disjoint unordered", "[theta_intersection]") {
|
|
178
198
|
update_theta_sketch sketch1 = update_theta_sketch::builder().build();
|
|
179
199
|
int value = 0;
|
|
@@ -100,6 +100,28 @@ TEST_CASE("theta jaccard: half overlap estimation mode", "[theta_sketch]") {
|
|
|
100
100
|
REQUIRE(jc[2] == Approx(0.33).margin(0.01));
|
|
101
101
|
}
|
|
102
102
|
|
|
103
|
+
TEST_CASE("theta jaccard: half overlap estimation mode custom seed", "[theta_sketch]") {
|
|
104
|
+
const uint64_t seed = 123;
|
|
105
|
+
auto sk_a = update_theta_sketch::builder().set_seed(seed).build();
|
|
106
|
+
auto sk_b = update_theta_sketch::builder().set_seed(seed).build();
|
|
107
|
+
for (int i = 0; i < 10000; ++i) {
|
|
108
|
+
sk_a.update(i);
|
|
109
|
+
sk_b.update(i + 5000);
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
// update sketches
|
|
113
|
+
auto jc = theta_jaccard_similarity::jaccard(sk_a, sk_b, seed);
|
|
114
|
+
REQUIRE(jc[0] == Approx(0.33).margin(0.01));
|
|
115
|
+
REQUIRE(jc[1] == Approx(0.33).margin(0.01));
|
|
116
|
+
REQUIRE(jc[2] == Approx(0.33).margin(0.01));
|
|
117
|
+
|
|
118
|
+
// compact sketches
|
|
119
|
+
jc = theta_jaccard_similarity::jaccard(sk_a.compact(), sk_b.compact(), seed);
|
|
120
|
+
REQUIRE(jc[0] == Approx(0.33).margin(0.01));
|
|
121
|
+
REQUIRE(jc[1] == Approx(0.33).margin(0.01));
|
|
122
|
+
REQUIRE(jc[2] == Approx(0.33).margin(0.01));
|
|
123
|
+
}
|
|
124
|
+
|
|
103
125
|
/**
|
|
104
126
|
* The distribution is quite tight, about +/- 0.7%, which is pretty good since the accuracy of the
|
|
105
127
|
* underlying sketch is about +/- 1.56%.
|
|
@@ -107,7 +129,7 @@ TEST_CASE("theta jaccard: half overlap estimation mode", "[theta_sketch]") {
|
|
|
107
129
|
TEST_CASE("theta jaccard: similarity test", "[theta_sketch]") {
|
|
108
130
|
const int8_t min_lg_k = 12;
|
|
109
131
|
const int u1 = 1 << 20;
|
|
110
|
-
const int u2 = u1 * 0.95;
|
|
132
|
+
const int u2 = static_cast<int>(u1 * 0.95);
|
|
111
133
|
const double threshold = 0.943;
|
|
112
134
|
|
|
113
135
|
auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).build();
|
|
@@ -120,6 +142,23 @@ TEST_CASE("theta jaccard: similarity test", "[theta_sketch]") {
|
|
|
120
142
|
REQUIRE(theta_jaccard_similarity::similarity_test(actual, actual, threshold));
|
|
121
143
|
}
|
|
122
144
|
|
|
145
|
+
TEST_CASE("theta jaccard: similarity test custom seed", "[theta_sketch]") {
|
|
146
|
+
const int8_t min_lg_k = 12;
|
|
147
|
+
const int u1 = 1 << 20;
|
|
148
|
+
const int u2 = static_cast<int>(u1 * 0.95);
|
|
149
|
+
const double threshold = 0.943;
|
|
150
|
+
const uint64_t seed = 1234;
|
|
151
|
+
|
|
152
|
+
auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
|
|
153
|
+
for (int i = 0; i < u1; ++i) expected.update(i);
|
|
154
|
+
|
|
155
|
+
auto actual = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
|
|
156
|
+
for (int i = 0; i < u2; ++i) actual.update(i);
|
|
157
|
+
|
|
158
|
+
REQUIRE(theta_jaccard_similarity::similarity_test(actual, expected, threshold, seed));
|
|
159
|
+
REQUIRE(theta_jaccard_similarity::similarity_test(actual, actual, threshold, seed));
|
|
160
|
+
}
|
|
161
|
+
|
|
123
162
|
/**
|
|
124
163
|
* The distribution is much looser here, about +/- 14%. This is due to the fact that intersections loose accuracy
|
|
125
164
|
* as the ratio of intersection to the union becomes a small number.
|
|
@@ -127,7 +166,7 @@ TEST_CASE("theta jaccard: similarity test", "[theta_sketch]") {
|
|
|
127
166
|
TEST_CASE("theta jaccard: dissimilarity test", "[theta_sketch]") {
|
|
128
167
|
const int8_t min_lg_k = 12;
|
|
129
168
|
const int u1 = 1 << 20;
|
|
130
|
-
const int u2 = u1 * 0.05;
|
|
169
|
+
const int u2 = static_cast<int>(u1 * 0.05);
|
|
131
170
|
const double threshold = 0.061;
|
|
132
171
|
|
|
133
172
|
auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).build();
|
|
@@ -140,4 +179,21 @@ TEST_CASE("theta jaccard: dissimilarity test", "[theta_sketch]") {
|
|
|
140
179
|
REQUIRE_FALSE(theta_jaccard_similarity::dissimilarity_test(actual, actual, threshold));
|
|
141
180
|
}
|
|
142
181
|
|
|
182
|
+
TEST_CASE("theta jaccard: dissimilarity test custom seed", "[theta_sketch]") {
|
|
183
|
+
const int8_t min_lg_k = 12;
|
|
184
|
+
const int u1 = 1 << 20;
|
|
185
|
+
const int u2 = static_cast<int>(u1 * 0.05);
|
|
186
|
+
const double threshold = 0.061;
|
|
187
|
+
const uint64_t seed = 1234;
|
|
188
|
+
|
|
189
|
+
auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
|
|
190
|
+
for (int i = 0; i < u1; ++i) expected.update(i);
|
|
191
|
+
|
|
192
|
+
auto actual = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
|
|
193
|
+
for (int i = 0; i < u2; ++i) actual.update(i);
|
|
194
|
+
|
|
195
|
+
REQUIRE(theta_jaccard_similarity::dissimilarity_test(actual, expected, threshold, seed));
|
|
196
|
+
REQUIRE_FALSE(theta_jaccard_similarity::dissimilarity_test(actual, actual, threshold, seed));
|
|
197
|
+
}
|
|
198
|
+
|
|
143
199
|
} /* namespace datasketches */
|
|
@@ -50,7 +50,7 @@ TEST_CASE("theta sketch: empty", "[theta_sketch]") {
|
|
|
50
50
|
}
|
|
51
51
|
|
|
52
52
|
TEST_CASE("theta sketch: non empty no retained keys", "[theta_sketch]") {
|
|
53
|
-
update_theta_sketch update_sketch = update_theta_sketch::builder().set_p(0.
|
|
53
|
+
update_theta_sketch update_sketch = update_theta_sketch::builder().set_p(0.001f).build();
|
|
54
54
|
update_sketch.update(1);
|
|
55
55
|
//std::cerr << update_sketch.to_string();
|
|
56
56
|
REQUIRE(update_sketch.get_num_retained() == 0);
|
|
@@ -238,4 +238,40 @@ TEST_CASE("theta sketch: deserialize compact single item buffer overrun", "[thet
|
|
|
238
238
|
REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
|
|
239
239
|
}
|
|
240
240
|
|
|
241
|
+
TEST_CASE("theta sketch: conversion constructor and wrapped compact", "[theta_sketch]") {
|
|
242
|
+
update_theta_sketch update_sketch = update_theta_sketch::builder().build();
|
|
243
|
+
const int n = 8192;
|
|
244
|
+
for (int i = 0; i < n; i++) update_sketch.update(i);
|
|
245
|
+
|
|
246
|
+
// unordered
|
|
247
|
+
auto unordered_compact1 = update_sketch.compact(false);
|
|
248
|
+
compact_theta_sketch unordered_compact2(update_sketch, false);
|
|
249
|
+
auto it = unordered_compact1.begin();
|
|
250
|
+
for (auto entry: unordered_compact2) {
|
|
251
|
+
REQUIRE(*it == entry);
|
|
252
|
+
++it;
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
// ordered
|
|
256
|
+
auto ordered_compact1 = update_sketch.compact();
|
|
257
|
+
compact_theta_sketch ordered_compact2(update_sketch, true);
|
|
258
|
+
it = ordered_compact1.begin();
|
|
259
|
+
for (auto entry: ordered_compact2) {
|
|
260
|
+
REQUIRE(*it == entry);
|
|
261
|
+
++it;
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
// wrapped compact
|
|
265
|
+
auto bytes = ordered_compact1.serialize();
|
|
266
|
+
auto ordered_compact3 = wrapped_compact_theta_sketch::wrap(bytes.data(), bytes.size());
|
|
267
|
+
it = ordered_compact1.begin();
|
|
268
|
+
for (auto entry: ordered_compact3) {
|
|
269
|
+
REQUIRE(*it == entry);
|
|
270
|
+
++it;
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
// seed mismatch
|
|
274
|
+
REQUIRE_THROWS_AS(wrapped_compact_theta_sketch::wrap(bytes.data(), bytes.size(), 0), std::invalid_argument);
|
|
275
|
+
}
|
|
276
|
+
|
|
241
277
|
} /* namespace datasketches */
|
|
@@ -39,7 +39,7 @@ TEST_CASE("theta union: empty", "[theta_union]") {
|
|
|
39
39
|
}
|
|
40
40
|
|
|
41
41
|
TEST_CASE("theta union: non empty no retained keys", "[theta_union]") {
|
|
42
|
-
update_theta_sketch update_sketch = update_theta_sketch::builder().set_p(0.
|
|
42
|
+
update_theta_sketch update_sketch = update_theta_sketch::builder().set_p(0.001f).build();
|
|
43
43
|
update_sketch.update(1);
|
|
44
44
|
theta_union u = theta_union::builder().build();
|
|
45
45
|
u.update(update_sketch);
|
|
@@ -65,7 +65,27 @@ TEST_CASE("theta union: exact mode half overlap", "[theta_union]") {
|
|
|
65
65
|
compact_theta_sketch sketch3 = u.get_result();
|
|
66
66
|
REQUIRE_FALSE(sketch3.is_empty());
|
|
67
67
|
REQUIRE_FALSE(sketch3.is_estimation_mode());
|
|
68
|
-
REQUIRE(sketch3.get_estimate() ==
|
|
68
|
+
REQUIRE(sketch3.get_estimate() == 1500.0);
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
TEST_CASE("theta union: exact mode half overlap wrapped compact", "[theta_union]") {
|
|
72
|
+
update_theta_sketch sketch1 = update_theta_sketch::builder().build();
|
|
73
|
+
int value = 0;
|
|
74
|
+
for (int i = 0; i < 1000; i++) sketch1.update(value++);
|
|
75
|
+
auto bytes1 = sketch1.compact().serialize();
|
|
76
|
+
|
|
77
|
+
update_theta_sketch sketch2 = update_theta_sketch::builder().build();
|
|
78
|
+
value = 500;
|
|
79
|
+
for (int i = 0; i < 1000; i++) sketch2.update(value++);
|
|
80
|
+
auto bytes2 = sketch2.compact().serialize();
|
|
81
|
+
|
|
82
|
+
theta_union u = theta_union::builder().build();
|
|
83
|
+
u.update(wrapped_compact_theta_sketch::wrap(bytes1.data(), bytes1.size()));
|
|
84
|
+
u.update(wrapped_compact_theta_sketch::wrap(bytes2.data(), bytes2.size()));
|
|
85
|
+
compact_theta_sketch sketch3 = u.get_result();
|
|
86
|
+
REQUIRE_FALSE(sketch3.is_empty());
|
|
87
|
+
REQUIRE_FALSE(sketch3.is_estimation_mode());
|
|
88
|
+
REQUIRE(sketch3.get_estimate() == 1500.0);
|
|
69
89
|
}
|
|
70
90
|
|
|
71
91
|
TEST_CASE("theta union: estimation mode half overlap", "[theta_union]") {
|
|
@@ -70,33 +70,33 @@ uint8_t compact_array_of_doubles_sketch_alloc<A>::get_num_values() const {
|
|
|
70
70
|
template<typename A>
|
|
71
71
|
void compact_array_of_doubles_sketch_alloc<A>::serialize(std::ostream& os) const {
|
|
72
72
|
const uint8_t preamble_longs = 1;
|
|
73
|
-
|
|
73
|
+
write(os, preamble_longs);
|
|
74
74
|
const uint8_t serial_version = SERIAL_VERSION;
|
|
75
|
-
|
|
75
|
+
write(os, serial_version);
|
|
76
76
|
const uint8_t family = SKETCH_FAMILY;
|
|
77
|
-
|
|
77
|
+
write(os, family);
|
|
78
78
|
const uint8_t type = SKETCH_TYPE;
|
|
79
|
-
|
|
79
|
+
write(os, type);
|
|
80
80
|
const uint8_t flags_byte(
|
|
81
81
|
(this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
|
|
82
82
|
(this->get_num_retained() > 0 ? 1 << flags::HAS_ENTRIES : 0) |
|
|
83
83
|
(this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
|
|
84
84
|
);
|
|
85
|
-
|
|
86
|
-
|
|
85
|
+
write(os, flags_byte);
|
|
86
|
+
write(os, num_values_);
|
|
87
87
|
const uint16_t seed_hash = this->get_seed_hash();
|
|
88
|
-
|
|
89
|
-
|
|
88
|
+
write(os, seed_hash);
|
|
89
|
+
write(os, this->theta_);
|
|
90
90
|
if (this->get_num_retained() > 0) {
|
|
91
|
-
const uint32_t num_entries = this->entries_.size();
|
|
92
|
-
|
|
91
|
+
const uint32_t num_entries = static_cast<uint32_t>(this->entries_.size());
|
|
92
|
+
write(os, num_entries);
|
|
93
93
|
const uint32_t unused32 = 0;
|
|
94
|
-
|
|
94
|
+
write(os, unused32);
|
|
95
95
|
for (const auto& it: this->entries_) {
|
|
96
|
-
|
|
96
|
+
write(os, it.first);
|
|
97
97
|
}
|
|
98
98
|
for (const auto& it: this->entries_) {
|
|
99
|
-
|
|
99
|
+
write(os, it.second.data(), it.second.size() * sizeof(double));
|
|
100
100
|
}
|
|
101
101
|
}
|
|
102
102
|
}
|
|
@@ -110,30 +110,29 @@ auto compact_array_of_doubles_sketch_alloc<A>::serialize(unsigned header_size_by
|
|
|
110
110
|
vector_bytes bytes(size, 0, this->entries_.get_allocator());
|
|
111
111
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
112
112
|
|
|
113
|
-
ptr += copy_to_mem(
|
|
113
|
+
ptr += copy_to_mem(preamble_longs, ptr);
|
|
114
114
|
const uint8_t serial_version = SERIAL_VERSION;
|
|
115
|
-
ptr += copy_to_mem(
|
|
115
|
+
ptr += copy_to_mem(serial_version, ptr);
|
|
116
116
|
const uint8_t family = SKETCH_FAMILY;
|
|
117
|
-
ptr += copy_to_mem(
|
|
117
|
+
ptr += copy_to_mem(family, ptr);
|
|
118
118
|
const uint8_t type = SKETCH_TYPE;
|
|
119
|
-
ptr += copy_to_mem(
|
|
119
|
+
ptr += copy_to_mem(type, ptr);
|
|
120
120
|
const uint8_t flags_byte(
|
|
121
121
|
(this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
|
|
122
122
|
(this->get_num_retained() ? 1 << flags::HAS_ENTRIES : 0) |
|
|
123
123
|
(this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
|
|
124
124
|
);
|
|
125
|
-
ptr += copy_to_mem(
|
|
126
|
-
ptr += copy_to_mem(
|
|
125
|
+
ptr += copy_to_mem(flags_byte, ptr);
|
|
126
|
+
ptr += copy_to_mem(num_values_, ptr);
|
|
127
127
|
const uint16_t seed_hash = this->get_seed_hash();
|
|
128
|
-
ptr += copy_to_mem(
|
|
129
|
-
ptr += copy_to_mem(
|
|
128
|
+
ptr += copy_to_mem(seed_hash, ptr);
|
|
129
|
+
ptr += copy_to_mem((this->theta_), ptr);
|
|
130
130
|
if (this->get_num_retained() > 0) {
|
|
131
|
-
const uint32_t num_entries = this->entries_.size();
|
|
132
|
-
ptr += copy_to_mem(
|
|
133
|
-
|
|
134
|
-
ptr += copy_to_mem(&unused32, ptr, sizeof(unused32));
|
|
131
|
+
const uint32_t num_entries = static_cast<uint32_t>(this->entries_.size());
|
|
132
|
+
ptr += copy_to_mem(num_entries, ptr);
|
|
133
|
+
ptr += sizeof(uint32_t); // unused
|
|
135
134
|
for (const auto& it: this->entries_) {
|
|
136
|
-
ptr += copy_to_mem(
|
|
135
|
+
ptr += copy_to_mem(it.first, ptr);
|
|
137
136
|
}
|
|
138
137
|
for (const auto& it: this->entries_) {
|
|
139
138
|
ptr += copy_to_mem(it.second.data(), ptr, it.second.size() * sizeof(double));
|
|
@@ -144,40 +143,30 @@ auto compact_array_of_doubles_sketch_alloc<A>::serialize(unsigned header_size_by
|
|
|
144
143
|
|
|
145
144
|
template<typename A>
|
|
146
145
|
compact_array_of_doubles_sketch_alloc<A> compact_array_of_doubles_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
|
|
147
|
-
uint8_t
|
|
148
|
-
|
|
149
|
-
uint8_t
|
|
150
|
-
|
|
151
|
-
uint8_t
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
is.read(reinterpret_cast<char*>(&type), sizeof(type));
|
|
155
|
-
uint8_t flags_byte;
|
|
156
|
-
is.read(reinterpret_cast<char*>(&flags_byte), sizeof(flags_byte));
|
|
157
|
-
uint8_t num_values;
|
|
158
|
-
is.read(reinterpret_cast<char*>(&num_values), sizeof(num_values));
|
|
159
|
-
uint16_t seed_hash;
|
|
160
|
-
is.read(reinterpret_cast<char*>(&seed_hash), sizeof(seed_hash));
|
|
146
|
+
read<uint8_t>(is); // unused
|
|
147
|
+
const auto serial_version = read<uint8_t>(is);
|
|
148
|
+
const auto family = read<uint8_t>(is);
|
|
149
|
+
const auto type = read<uint8_t>(is);
|
|
150
|
+
const auto flags_byte = read<uint8_t>(is);
|
|
151
|
+
const auto num_values = read<uint8_t>(is);
|
|
152
|
+
const auto seed_hash = read<uint16_t>(is);
|
|
161
153
|
checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
|
|
162
154
|
checker<true>::check_sketch_family(family, SKETCH_FAMILY);
|
|
163
155
|
checker<true>::check_sketch_type(type, SKETCH_TYPE);
|
|
164
156
|
const bool has_entries = flags_byte & (1 << flags::HAS_ENTRIES);
|
|
165
157
|
if (has_entries) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
|
166
158
|
|
|
167
|
-
|
|
168
|
-
is.read(reinterpret_cast<char*>(&theta), sizeof(theta));
|
|
159
|
+
const auto theta = read<uint64_t>(is);
|
|
169
160
|
std::vector<Entry, AllocEntry> entries(allocator);
|
|
170
161
|
if (has_entries) {
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
uint32_t unused32;
|
|
174
|
-
is.read(reinterpret_cast<char*>(&unused32), sizeof(unused32));
|
|
162
|
+
const auto num_entries = read<uint32_t>(is);
|
|
163
|
+
read<uint32_t>(is); // unused
|
|
175
164
|
entries.reserve(num_entries);
|
|
176
165
|
std::vector<uint64_t, AllocU64> keys(num_entries, 0, allocator);
|
|
177
|
-
|
|
166
|
+
read(is, keys.data(), num_entries * sizeof(uint64_t));
|
|
178
167
|
for (size_t i = 0; i < num_entries; ++i) {
|
|
179
168
|
aod<A> summary(num_values, allocator);
|
|
180
|
-
|
|
169
|
+
read(is, summary.data(), num_values * sizeof(double));
|
|
181
170
|
entries.push_back(Entry(keys[i], std::move(summary)));
|
|
182
171
|
}
|
|
183
172
|
}
|
|
@@ -191,20 +180,19 @@ template<typename A>
|
|
|
191
180
|
compact_array_of_doubles_sketch_alloc<A> compact_array_of_doubles_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
|
|
192
181
|
ensure_minimum_memory(size, 16);
|
|
193
182
|
const char* ptr = static_cast<const char*>(bytes);
|
|
194
|
-
uint8_t
|
|
195
|
-
ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
|
|
183
|
+
ptr += sizeof(uint8_t); // unused
|
|
196
184
|
uint8_t serial_version;
|
|
197
|
-
ptr += copy_from_mem(ptr,
|
|
185
|
+
ptr += copy_from_mem(ptr, serial_version);
|
|
198
186
|
uint8_t family;
|
|
199
|
-
ptr += copy_from_mem(ptr,
|
|
187
|
+
ptr += copy_from_mem(ptr, family);
|
|
200
188
|
uint8_t type;
|
|
201
|
-
ptr += copy_from_mem(ptr,
|
|
189
|
+
ptr += copy_from_mem(ptr, type);
|
|
202
190
|
uint8_t flags_byte;
|
|
203
|
-
ptr += copy_from_mem(ptr,
|
|
191
|
+
ptr += copy_from_mem(ptr, flags_byte);
|
|
204
192
|
uint8_t num_values;
|
|
205
|
-
ptr += copy_from_mem(ptr,
|
|
193
|
+
ptr += copy_from_mem(ptr, num_values);
|
|
206
194
|
uint16_t seed_hash;
|
|
207
|
-
ptr += copy_from_mem(ptr,
|
|
195
|
+
ptr += copy_from_mem(ptr, seed_hash);
|
|
208
196
|
checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
|
|
209
197
|
checker<true>::check_sketch_family(family, SKETCH_FAMILY);
|
|
210
198
|
checker<true>::check_sketch_type(type, SKETCH_TYPE);
|
|
@@ -212,14 +200,13 @@ compact_array_of_doubles_sketch_alloc<A> compact_array_of_doubles_sketch_alloc<A
|
|
|
212
200
|
if (has_entries) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
|
213
201
|
|
|
214
202
|
uint64_t theta;
|
|
215
|
-
ptr += copy_from_mem(ptr,
|
|
203
|
+
ptr += copy_from_mem(ptr, theta);
|
|
216
204
|
std::vector<Entry, AllocEntry> entries(allocator);
|
|
217
205
|
if (has_entries) {
|
|
218
206
|
ensure_minimum_memory(size, 24);
|
|
219
207
|
uint32_t num_entries;
|
|
220
|
-
ptr += copy_from_mem(ptr,
|
|
221
|
-
uint32_t
|
|
222
|
-
ptr += copy_from_mem(ptr, &unused32, sizeof(unused32));
|
|
208
|
+
ptr += copy_from_mem(ptr, num_entries);
|
|
209
|
+
ptr += sizeof(uint32_t); // unused
|
|
223
210
|
ensure_minimum_memory(size, 24 + (sizeof(uint64_t) + sizeof(double) * num_values) * num_entries);
|
|
224
211
|
entries.reserve(num_entries);
|
|
225
212
|
std::vector<uint64_t, AllocU64> keys(num_entries, 0, allocator);
|