datasketches 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/datasketches/cpc_wrapper.cpp +12 -13
- data/ext/datasketches/ext.cpp +1 -1
- data/ext/datasketches/ext.h +4 -0
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/fi_wrapper.cpp +6 -8
- data/ext/datasketches/hll_wrapper.cpp +13 -14
- data/ext/datasketches/kll_wrapper.cpp +28 -76
- data/ext/datasketches/theta_wrapper.cpp +27 -41
- data/ext/datasketches/vo_wrapper.cpp +4 -6
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/README.md +4 -4
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
- data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
- data/vendor/datasketches-cpp/python/README.md +6 -3
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
- data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
- data/vendor/datasketches-cpp/setup.py +5 -3
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
- data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
- data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
- metadata +43 -34
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
|
@@ -20,103 +20,70 @@
|
|
|
20
20
|
#ifndef THETA_UNION_HPP_
|
|
21
21
|
#define THETA_UNION_HPP_
|
|
22
22
|
|
|
23
|
-
#include <memory>
|
|
24
|
-
#include <functional>
|
|
25
|
-
#include <climits>
|
|
26
|
-
|
|
23
|
+
#include "serde.hpp"
|
|
27
24
|
#include "theta_sketch.hpp"
|
|
25
|
+
#include "theta_union_base.hpp"
|
|
28
26
|
|
|
29
27
|
namespace datasketches {
|
|
30
28
|
|
|
31
|
-
|
|
32
|
-
* author Alexander Saydakov
|
|
33
|
-
* author Lee Rhodes
|
|
34
|
-
* author Kevin Lang
|
|
35
|
-
*/
|
|
36
|
-
|
|
37
|
-
template<typename A>
|
|
29
|
+
template<typename Allocator = std::allocator<uint64_t>>
|
|
38
30
|
class theta_union_alloc {
|
|
39
31
|
public:
|
|
40
|
-
|
|
32
|
+
using Entry = uint64_t;
|
|
33
|
+
using ExtractKey = trivial_extract_key;
|
|
34
|
+
using Sketch = theta_sketch_alloc<Allocator>;
|
|
35
|
+
using CompactSketch = compact_theta_sketch_alloc<Allocator>;
|
|
36
|
+
using resize_factor = theta_constants::resize_factor;
|
|
37
|
+
|
|
38
|
+
struct pass_through_policy {
|
|
39
|
+
uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
|
|
40
|
+
unused(incoming_entry);
|
|
41
|
+
return internal_entry;
|
|
42
|
+
}
|
|
43
|
+
};
|
|
44
|
+
using State = theta_union_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
|
|
41
45
|
|
|
42
46
|
// No constructor here. Use builder instead.
|
|
47
|
+
class builder;
|
|
43
48
|
|
|
44
49
|
/**
|
|
45
50
|
* This method is to update the union with a given sketch
|
|
46
51
|
* @param sketch to update the union with
|
|
47
52
|
*/
|
|
48
|
-
|
|
53
|
+
template<typename FwdSketch>
|
|
54
|
+
void update(FwdSketch&& sketch);
|
|
49
55
|
|
|
50
56
|
/**
|
|
51
57
|
* This method produces a copy of the current state of the union as a compact sketch.
|
|
52
58
|
* @param ordered optional flag to specify if ordered sketch should be produced
|
|
53
59
|
* @return the result of the union
|
|
54
60
|
*/
|
|
55
|
-
|
|
61
|
+
CompactSketch get_result(bool ordered = true) const;
|
|
56
62
|
|
|
57
63
|
private:
|
|
58
|
-
|
|
59
|
-
uint64_t theta_;
|
|
60
|
-
update_theta_sketch_alloc<A> state_;
|
|
64
|
+
State state_;
|
|
61
65
|
|
|
62
66
|
// for builder
|
|
63
|
-
theta_union_alloc(uint64_t theta, update_theta_sketch_alloc<A>&& state);
|
|
67
|
+
theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Allocator& allocator);
|
|
64
68
|
};
|
|
65
69
|
|
|
66
|
-
// builder
|
|
67
|
-
|
|
68
70
|
template<typename A>
|
|
69
|
-
class theta_union_alloc<A>::builder {
|
|
71
|
+
class theta_union_alloc<A>::builder: public theta_base_builder<builder, A> {
|
|
70
72
|
public:
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
/**
|
|
74
|
-
* Set log2(k), where k is a nominal number of entries in the sketch
|
|
75
|
-
* @param lg_k base 2 logarithm of nominal number of entries
|
|
76
|
-
* @return this builder
|
|
77
|
-
*/
|
|
78
|
-
builder& set_lg_k(uint8_t lg_k);
|
|
79
|
-
|
|
80
|
-
/**
|
|
81
|
-
* Set resize factor for the internal hash table (defaults to 8)
|
|
82
|
-
* @param rf resize factor
|
|
83
|
-
* @return this builder
|
|
84
|
-
*/
|
|
85
|
-
builder& set_resize_factor(resize_factor rf);
|
|
86
|
-
|
|
87
|
-
/**
|
|
88
|
-
* Set sampling probability (initial theta). The default is 1, so the sketch retains
|
|
89
|
-
* all entries until it reaches the limit, at which point it goes into the estimation mode
|
|
90
|
-
* and reduces the effective sampling probability (theta) as necessary.
|
|
91
|
-
* @param p sampling probability
|
|
92
|
-
* @return this builder
|
|
93
|
-
*/
|
|
94
|
-
builder& set_p(float p);
|
|
95
|
-
|
|
96
|
-
/**
|
|
97
|
-
* Set the seed for the hash function. Should be used carefully if needed.
|
|
98
|
-
* Sketches produced with different seed are not compatible
|
|
99
|
-
* and cannot be mixed in set operations.
|
|
100
|
-
* @param seed hash seed
|
|
101
|
-
* @return this builder
|
|
102
|
-
*/
|
|
103
|
-
builder& set_seed(uint64_t seed);
|
|
73
|
+
builder(const A& allocator = A());
|
|
104
74
|
|
|
105
75
|
/**
|
|
106
76
|
* This is to create an instance of the union with predefined parameters.
|
|
107
|
-
* @return
|
|
77
|
+
* @return an instance of the union
|
|
108
78
|
*/
|
|
109
79
|
theta_union_alloc<A> build() const;
|
|
110
|
-
|
|
111
|
-
private:
|
|
112
|
-
typename update_theta_sketch_alloc<A>::builder sketch_builder;
|
|
113
80
|
};
|
|
114
81
|
|
|
115
82
|
// alias with default allocator for convenience
|
|
116
|
-
|
|
83
|
+
using theta_union = theta_union_alloc<std::allocator<uint64_t>>;
|
|
117
84
|
|
|
118
85
|
} /* namespace datasketches */
|
|
119
86
|
|
|
120
87
|
#include "theta_union_impl.hpp"
|
|
121
88
|
|
|
122
|
-
#
|
|
89
|
+
#endif
|
|
@@ -17,6 +17,9 @@
|
|
|
17
17
|
* under the License.
|
|
18
18
|
*/
|
|
19
19
|
|
|
20
|
+
#ifndef THETA_UNION_BASE_IMPL_HPP_
|
|
21
|
+
#define THETA_UNION_BASE_IMPL_HPP_
|
|
22
|
+
|
|
20
23
|
#include <algorithm>
|
|
21
24
|
|
|
22
25
|
#include "conditional_forward.hpp"
|
|
@@ -82,3 +85,5 @@ const P& theta_union_base<EN, EK, P, S, CS, A>::get_policy() const {
|
|
|
82
85
|
}
|
|
83
86
|
|
|
84
87
|
} /* namespace datasketches */
|
|
88
|
+
|
|
89
|
+
#endif
|
|
@@ -22,86 +22,30 @@
|
|
|
22
22
|
|
|
23
23
|
namespace datasketches {
|
|
24
24
|
|
|
25
|
-
/*
|
|
26
|
-
* author Alexander Saydakov
|
|
27
|
-
* author Lee Rhodes
|
|
28
|
-
* author Kevin Lang
|
|
29
|
-
*/
|
|
30
|
-
|
|
31
|
-
template<typename A>
|
|
32
|
-
theta_union_alloc<A>::theta_union_alloc(uint64_t theta, update_theta_sketch_alloc<A>&& state):
|
|
33
|
-
is_empty_(true), theta_(theta), state_(std::move(state)) {}
|
|
34
|
-
|
|
35
|
-
template<typename A>
|
|
36
|
-
void theta_union_alloc<A>::update(const theta_sketch_alloc<A>& sketch) {
|
|
37
|
-
if (sketch.is_empty()) return;
|
|
38
|
-
if (sketch.get_seed_hash() != state_.get_seed_hash()) throw std::invalid_argument("seed hash mismatch");
|
|
39
|
-
is_empty_ = false;
|
|
40
|
-
if (sketch.get_theta64() < theta_) theta_ = sketch.get_theta64();
|
|
41
|
-
if (sketch.is_ordered()) {
|
|
42
|
-
for (auto hash: sketch) {
|
|
43
|
-
if (hash >= theta_) break; // early stop
|
|
44
|
-
state_.internal_update(hash);
|
|
45
|
-
}
|
|
46
|
-
} else {
|
|
47
|
-
for (auto hash: sketch) if (hash < theta_) state_.internal_update(hash);
|
|
48
|
-
}
|
|
49
|
-
if (state_.get_theta64() < theta_) theta_ = state_.get_theta64();
|
|
50
|
-
}
|
|
51
|
-
|
|
52
25
|
template<typename A>
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
if (theta_ >= state_.theta_ && state_.get_num_retained() <= nom_num_keys) return state_.compact(ordered);
|
|
57
|
-
uint64_t theta = std::min(theta_, state_.get_theta64());
|
|
58
|
-
vector_u64<A> keys(state_.get_num_retained());
|
|
59
|
-
uint32_t num_keys = 0;
|
|
60
|
-
for (auto key: state_) {
|
|
61
|
-
if (key < theta) keys[num_keys++] = key;
|
|
62
|
-
}
|
|
63
|
-
if (num_keys > nom_num_keys) {
|
|
64
|
-
std::nth_element(keys.begin(), keys.begin() + nom_num_keys, keys.begin() + num_keys);
|
|
65
|
-
theta = keys[nom_num_keys];
|
|
66
|
-
num_keys = nom_num_keys;
|
|
67
|
-
}
|
|
68
|
-
if (num_keys != state_.get_num_retained()) {
|
|
69
|
-
keys.resize(num_keys);
|
|
70
|
-
}
|
|
71
|
-
if (ordered) std::sort(keys.begin(), keys.end());
|
|
72
|
-
return compact_theta_sketch_alloc<A>(false, theta, std::move(keys), state_.get_seed_hash(), ordered);
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
// builder
|
|
26
|
+
theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator):
|
|
27
|
+
state_(lg_cur_size, lg_nom_size, rf, theta, seed, pass_through_policy(), allocator)
|
|
28
|
+
{}
|
|
76
29
|
|
|
77
30
|
template<typename A>
|
|
78
|
-
typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_lg_k(uint8_t lg_k) {
|
|
79
|
-
|
|
80
|
-
|
|
31
|
+
template<typename SS>
|
|
32
|
+
void theta_union_alloc<A>::update(SS&& sketch) {
|
|
33
|
+
state_.update(std::forward<SS>(sketch));
|
|
81
34
|
}
|
|
82
35
|
|
|
83
36
|
template<typename A>
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
return *this;
|
|
37
|
+
auto theta_union_alloc<A>::get_result(bool ordered) const -> CompactSketch {
|
|
38
|
+
return state_.get_result(ordered);
|
|
87
39
|
}
|
|
88
40
|
|
|
89
41
|
template<typename A>
|
|
90
|
-
|
|
91
|
-
sketch_builder.set_p(p);
|
|
92
|
-
return *this;
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
template<typename A>
|
|
96
|
-
typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_seed(uint64_t seed) {
|
|
97
|
-
sketch_builder.set_seed(seed);
|
|
98
|
-
return *this;
|
|
99
|
-
}
|
|
42
|
+
theta_union_alloc<A>::builder::builder(const A& allocator): theta_base_builder<builder, A>(allocator) {}
|
|
100
43
|
|
|
101
44
|
template<typename A>
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
45
|
+
auto theta_union_alloc<A>::builder::build() const -> theta_union_alloc {
|
|
46
|
+
return theta_union_alloc(
|
|
47
|
+
this->starting_sub_multiple(this->lg_k_ + 1, this->MIN_LG_K, static_cast<uint8_t>(this->rf_)),
|
|
48
|
+
this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
|
|
105
49
|
}
|
|
106
50
|
|
|
107
51
|
} /* namespace datasketches */
|
|
@@ -34,7 +34,7 @@ namespace datasketches {
|
|
|
34
34
|
template<
|
|
35
35
|
typename Entry,
|
|
36
36
|
typename ExtractKey,
|
|
37
|
-
typename Allocator
|
|
37
|
+
typename Allocator
|
|
38
38
|
>
|
|
39
39
|
struct theta_update_sketch_base {
|
|
40
40
|
using resize_factor = theta_constants::resize_factor;
|
|
@@ -147,7 +147,7 @@ protected:
|
|
|
147
147
|
static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
|
|
148
148
|
};
|
|
149
149
|
|
|
150
|
-
// key
|
|
150
|
+
// key extractor
|
|
151
151
|
|
|
152
152
|
struct trivial_extract_key {
|
|
153
153
|
template<typename T>
|
|
@@ -156,17 +156,7 @@ struct trivial_extract_key {
|
|
|
156
156
|
}
|
|
157
157
|
};
|
|
158
158
|
|
|
159
|
-
|
|
160
|
-
struct pair_extract_key {
|
|
161
|
-
K& operator()(std::pair<K, V>& entry) const {
|
|
162
|
-
return entry.first;
|
|
163
|
-
}
|
|
164
|
-
const K& operator()(const std::pair<K, V>& entry) const {
|
|
165
|
-
return entry.first;
|
|
166
|
-
}
|
|
167
|
-
};
|
|
168
|
-
|
|
169
|
-
// not zero
|
|
159
|
+
// key not zero
|
|
170
160
|
|
|
171
161
|
template<typename Entry, typename ExtractKey>
|
|
172
162
|
class key_not_zero {
|
|
@@ -195,12 +185,6 @@ static inline uint64_t compute_hash(const void* data, size_t length, uint64_t se
|
|
|
195
185
|
return (hashes.h1 >> 1); // Java implementation does unsigned shift >>> to make values positive
|
|
196
186
|
}
|
|
197
187
|
|
|
198
|
-
static inline uint16_t compute_seed_hash(uint64_t seed) {
|
|
199
|
-
HashState hashes;
|
|
200
|
-
MurmurHash3_x64_128(&seed, sizeof(seed), 0, hashes);
|
|
201
|
-
return hashes.h1;
|
|
202
|
-
}
|
|
203
|
-
|
|
204
188
|
// iterators
|
|
205
189
|
|
|
206
190
|
template<typename Entry, typename ExtractKey>
|
|
@@ -17,6 +17,9 @@
|
|
|
17
17
|
* under the License.
|
|
18
18
|
*/
|
|
19
19
|
|
|
20
|
+
#ifndef THETA_UPDATE_SKETCH_BASE_IMPL_HPP_
|
|
21
|
+
#define THETA_UPDATE_SKETCH_BASE_IMPL_HPP_
|
|
22
|
+
|
|
20
23
|
#include <iostream>
|
|
21
24
|
#include <sstream>
|
|
22
25
|
#include <algorithm>
|
|
@@ -69,7 +72,7 @@ entries_(nullptr)
|
|
|
69
72
|
|
|
70
73
|
template<typename EN, typename EK, typename A>
|
|
71
74
|
theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(theta_update_sketch_base&& other) noexcept:
|
|
72
|
-
allocator_(other.allocator_),
|
|
75
|
+
allocator_(std::move(other.allocator_)),
|
|
73
76
|
is_empty_(other.is_empty_),
|
|
74
77
|
lg_cur_size_(other.lg_cur_size_),
|
|
75
78
|
lg_nom_size_(other.lg_nom_size_),
|
|
@@ -387,3 +390,5 @@ auto theta_const_iterator<Entry, ExtractKey>::operator*() const -> const Entry&
|
|
|
387
390
|
}
|
|
388
391
|
|
|
389
392
|
} /* namespace datasketches */
|
|
393
|
+
|
|
394
|
+
#endif
|
|
@@ -20,11 +20,10 @@
|
|
|
20
20
|
#include <iostream>
|
|
21
21
|
|
|
22
22
|
#include <catch.hpp>
|
|
23
|
-
#include <jaccard_similarity.hpp>
|
|
24
23
|
|
|
25
|
-
|
|
24
|
+
#include "theta_jaccard_similarity.hpp"
|
|
26
25
|
|
|
27
|
-
|
|
26
|
+
namespace datasketches {
|
|
28
27
|
|
|
29
28
|
TEST_CASE("theta jaccard: empty", "[theta_sketch]") {
|
|
30
29
|
auto sk_a = update_theta_sketch::builder().build();
|
|
@@ -17,10 +17,10 @@
|
|
|
17
17
|
* under the License.
|
|
18
18
|
*/
|
|
19
19
|
|
|
20
|
-
#include <catch.hpp>
|
|
21
20
|
#include <fstream>
|
|
22
21
|
#include <sstream>
|
|
23
22
|
|
|
23
|
+
#include <catch.hpp>
|
|
24
24
|
#include <theta_sketch.hpp>
|
|
25
25
|
|
|
26
26
|
namespace datasketches {
|
|
@@ -134,75 +134,7 @@ TEST_CASE("theta sketch: estimation", "[theta_sketch]") {
|
|
|
134
134
|
REQUIRE(compact_sketch.get_upper_bound(1) > n);
|
|
135
135
|
}
|
|
136
136
|
|
|
137
|
-
TEST_CASE("theta sketch: deserialize update empty from java as base", "[theta_sketch]") {
|
|
138
|
-
std::ifstream is;
|
|
139
|
-
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
140
|
-
is.open(inputPath + "theta_update_empty_from_java.sk", std::ios::binary);
|
|
141
|
-
auto sketchptr = theta_sketch::deserialize(is);
|
|
142
|
-
REQUIRE(sketchptr->is_empty());
|
|
143
|
-
REQUIRE_FALSE(sketchptr->is_estimation_mode());
|
|
144
|
-
REQUIRE(sketchptr->get_num_retained() == 0);
|
|
145
|
-
REQUIRE(sketchptr->get_theta() == 1.0);
|
|
146
|
-
REQUIRE(sketchptr->get_estimate() == 0.0);
|
|
147
|
-
REQUIRE(sketchptr->get_lower_bound(1) == 0.0);
|
|
148
|
-
REQUIRE(sketchptr->get_upper_bound(1) == 0.0);
|
|
149
|
-
}
|
|
150
|
-
|
|
151
|
-
TEST_CASE("theta sketch: deserialize update empty from java as subclass", "[theta_sketch]") {
|
|
152
|
-
std::ifstream is;
|
|
153
|
-
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
154
|
-
is.open(inputPath + "theta_update_empty_from_java.sk", std::ios::binary);
|
|
155
|
-
auto sketch = update_theta_sketch::deserialize(is);
|
|
156
|
-
REQUIRE(sketch.is_empty());
|
|
157
|
-
REQUIRE_FALSE(sketch.is_estimation_mode());
|
|
158
|
-
REQUIRE(sketch.get_num_retained() == 0);
|
|
159
|
-
REQUIRE(sketch.get_theta() == 1.0);
|
|
160
|
-
REQUIRE(sketch.get_estimate() == 0.0);
|
|
161
|
-
REQUIRE(sketch.get_lower_bound(1) == 0.0);
|
|
162
|
-
REQUIRE(sketch.get_upper_bound(1) == 0.0);
|
|
163
|
-
}
|
|
164
|
-
|
|
165
|
-
TEST_CASE("theta sketch: deserialize update estimation from java as base", "[theta_sketch]") {
|
|
166
|
-
std::ifstream is;
|
|
167
|
-
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
168
|
-
is.open(inputPath + "theta_update_estimation_from_java.sk", std::ios::binary);
|
|
169
|
-
auto sketchptr = theta_sketch::deserialize(is);
|
|
170
|
-
REQUIRE_FALSE(sketchptr->is_empty());
|
|
171
|
-
REQUIRE(sketchptr->is_estimation_mode());
|
|
172
|
-
REQUIRE(sketchptr->get_num_retained() == 5324);
|
|
173
|
-
REQUIRE(sketchptr->get_estimate() == Approx(10000.0).margin(10000 * 0.01));
|
|
174
|
-
REQUIRE(sketchptr->get_lower_bound(1) < 10000);
|
|
175
|
-
REQUIRE(sketchptr->get_upper_bound(1) > 10000);
|
|
176
|
-
}
|
|
177
|
-
|
|
178
|
-
TEST_CASE("theta sketch: deserialize update estimation from java as subclass", "[theta_sketch]") {
|
|
179
|
-
std::ifstream is;
|
|
180
|
-
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
181
|
-
is.open(inputPath + "theta_update_estimation_from_java.sk", std::ios::binary);
|
|
182
|
-
auto sketch = update_theta_sketch::deserialize(is);
|
|
183
|
-
REQUIRE_FALSE(sketch.is_empty());
|
|
184
|
-
REQUIRE(sketch.is_estimation_mode());
|
|
185
|
-
REQUIRE(sketch.get_num_retained() == 5324);
|
|
186
|
-
REQUIRE(sketch.get_estimate() == Approx(10000.0).margin(10000 * 0.01));
|
|
187
|
-
REQUIRE(sketch.get_lower_bound(1) < 10000);
|
|
188
|
-
REQUIRE(sketch.get_upper_bound(1) > 10000);
|
|
189
|
-
}
|
|
190
|
-
|
|
191
|
-
TEST_CASE("theta sketch: deserialize compact empty from java as base", "[theta_sketch]") {
|
|
192
|
-
std::ifstream is;
|
|
193
|
-
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
194
|
-
is.open(inputPath + "theta_compact_empty_from_java.sk", std::ios::binary);
|
|
195
|
-
auto sketchptr = theta_sketch::deserialize(is);
|
|
196
|
-
REQUIRE(sketchptr->is_empty());
|
|
197
|
-
REQUIRE_FALSE(sketchptr->is_estimation_mode());
|
|
198
|
-
REQUIRE(sketchptr->get_num_retained() == 0);
|
|
199
|
-
REQUIRE(sketchptr->get_theta() == 1.0);
|
|
200
|
-
REQUIRE(sketchptr->get_estimate() == 0.0);
|
|
201
|
-
REQUIRE(sketchptr->get_lower_bound(1) == 0.0);
|
|
202
|
-
REQUIRE(sketchptr->get_upper_bound(1) == 0.0);
|
|
203
|
-
}
|
|
204
|
-
|
|
205
|
-
TEST_CASE("theta sketch: deserialize compact empty from java as subclass", "[theta_sketch]") {
|
|
137
|
+
TEST_CASE("theta sketch: deserialize compact empty from java", "[theta_sketch]") {
|
|
206
138
|
std::ifstream is;
|
|
207
139
|
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
208
140
|
is.open(inputPath + "theta_compact_empty_from_java.sk", std::ios::binary);
|
|
@@ -216,21 +148,7 @@ TEST_CASE("theta sketch: deserialize compact empty from java as subclass", "[the
|
|
|
216
148
|
REQUIRE(sketch.get_upper_bound(1) == 0.0);
|
|
217
149
|
}
|
|
218
150
|
|
|
219
|
-
TEST_CASE("theta sketch: deserialize single item from java as base", "[theta_sketch]") {
|
|
220
|
-
std::ifstream is;
|
|
221
|
-
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
222
|
-
is.open(inputPath + "theta_compact_single_item_from_java.sk", std::ios::binary);
|
|
223
|
-
auto sketchptr = theta_sketch::deserialize(is);
|
|
224
|
-
REQUIRE_FALSE(sketchptr->is_empty());
|
|
225
|
-
REQUIRE_FALSE(sketchptr->is_estimation_mode());
|
|
226
|
-
REQUIRE(sketchptr->get_num_retained() == 1);
|
|
227
|
-
REQUIRE(sketchptr->get_theta() == 1.0);
|
|
228
|
-
REQUIRE(sketchptr->get_estimate() == 1.0);
|
|
229
|
-
REQUIRE(sketchptr->get_lower_bound(1) == 1.0);
|
|
230
|
-
REQUIRE(sketchptr->get_upper_bound(1) == 1.0);
|
|
231
|
-
}
|
|
232
|
-
|
|
233
|
-
TEST_CASE("theta sketch: deserialize single item from java as subclass", "[theta_sketch]") {
|
|
151
|
+
TEST_CASE("theta sketch: deserialize single item from java", "[theta_sketch]") {
|
|
234
152
|
std::ifstream is;
|
|
235
153
|
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
236
154
|
is.open(inputPath + "theta_compact_single_item_from_java.sk", std::ios::binary);
|
|
@@ -244,55 +162,21 @@ TEST_CASE("theta sketch: deserialize single item from java as subclass", "[theta
|
|
|
244
162
|
REQUIRE(sketch.get_upper_bound(1) == 1.0);
|
|
245
163
|
}
|
|
246
164
|
|
|
247
|
-
TEST_CASE("theta sketch: deserialize compact estimation from java as base", "[theta_sketch]") {
|
|
248
|
-
std::ifstream is;
|
|
249
|
-
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
250
|
-
is.open(inputPath + "theta_compact_estimation_from_java.sk", std::ios::binary);
|
|
251
|
-
auto sketchptr = theta_sketch::deserialize(is);
|
|
252
|
-
REQUIRE_FALSE(sketchptr->is_empty());
|
|
253
|
-
REQUIRE(sketchptr->is_estimation_mode());
|
|
254
|
-
REQUIRE(sketchptr->is_ordered());
|
|
255
|
-
REQUIRE(sketchptr->get_num_retained() == 4342);
|
|
256
|
-
REQUIRE(sketchptr->get_theta() == Approx(0.531700444213199).margin(1e-10));
|
|
257
|
-
REQUIRE(sketchptr->get_estimate() == Approx(8166.25234614053).margin(1e-10));
|
|
258
|
-
REQUIRE(sketchptr->get_lower_bound(2) == Approx(7996.956955317471).margin(1e-10));
|
|
259
|
-
REQUIRE(sketchptr->get_upper_bound(2) == Approx(8339.090301078124).margin(1e-10));
|
|
260
|
-
|
|
261
|
-
// the same construction process in Java must have produced exactly the same sketch
|
|
262
|
-
update_theta_sketch update_sketch = update_theta_sketch::builder().build();
|
|
263
|
-
const int n = 8192;
|
|
264
|
-
for (int i = 0; i < n; i++) update_sketch.update(i);
|
|
265
|
-
REQUIRE(sketchptr->get_num_retained() == update_sketch.get_num_retained());
|
|
266
|
-
REQUIRE(sketchptr->get_theta() == Approx(update_sketch.get_theta()).margin(1e-10));
|
|
267
|
-
REQUIRE(sketchptr->get_estimate() == Approx(update_sketch.get_estimate()).margin(1e-10));
|
|
268
|
-
REQUIRE(sketchptr->get_lower_bound(1) == Approx(update_sketch.get_lower_bound(1)).margin(1e-10));
|
|
269
|
-
REQUIRE(sketchptr->get_upper_bound(1) == Approx(update_sketch.get_upper_bound(1)).margin(1e-10));
|
|
270
|
-
REQUIRE(sketchptr->get_lower_bound(2) == Approx(update_sketch.get_lower_bound(2)).margin(1e-10));
|
|
271
|
-
REQUIRE(sketchptr->get_upper_bound(2) == Approx(update_sketch.get_upper_bound(2)).margin(1e-10));
|
|
272
|
-
REQUIRE(sketchptr->get_lower_bound(3) == Approx(update_sketch.get_lower_bound(3)).margin(1e-10));
|
|
273
|
-
REQUIRE(sketchptr->get_upper_bound(3) == Approx(update_sketch.get_upper_bound(3)).margin(1e-10));
|
|
274
|
-
compact_theta_sketch compact_sketch = update_sketch.compact();
|
|
275
|
-
// the sketches are ordered, so the iteration sequence must match exactly
|
|
276
|
-
auto iter = sketchptr->begin();
|
|
277
|
-
for (auto key: compact_sketch) {
|
|
278
|
-
REQUIRE(*iter == key);
|
|
279
|
-
++iter;
|
|
280
|
-
}
|
|
281
|
-
}
|
|
282
|
-
|
|
283
|
-
TEST_CASE("theta sketch: deserialize compact estimation from java as subclass", "[theta_sketch]") {
|
|
165
|
+
TEST_CASE("theta sketch: deserialize compact estimation from java", "[theta_sketch]") {
|
|
284
166
|
std::ifstream is;
|
|
285
167
|
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
286
168
|
is.open(inputPath + "theta_compact_estimation_from_java.sk", std::ios::binary);
|
|
287
169
|
auto sketch = compact_theta_sketch::deserialize(is);
|
|
288
170
|
REQUIRE_FALSE(sketch.is_empty());
|
|
289
171
|
REQUIRE(sketch.is_estimation_mode());
|
|
172
|
+
REQUIRE(sketch.is_ordered());
|
|
290
173
|
REQUIRE(sketch.get_num_retained() == 4342);
|
|
291
174
|
REQUIRE(sketch.get_theta() == Approx(0.531700444213199).margin(1e-10));
|
|
292
175
|
REQUIRE(sketch.get_estimate() == Approx(8166.25234614053).margin(1e-10));
|
|
293
176
|
REQUIRE(sketch.get_lower_bound(2) == Approx(7996.956955317471).margin(1e-10));
|
|
294
177
|
REQUIRE(sketch.get_upper_bound(2) == Approx(8339.090301078124).margin(1e-10));
|
|
295
178
|
|
|
179
|
+
// the same construction process in Java must have produced exactly the same sketch
|
|
296
180
|
update_theta_sketch update_sketch = update_theta_sketch::builder().build();
|
|
297
181
|
const int n = 8192;
|
|
298
182
|
for (int i = 0; i < n; i++) update_sketch.update(i);
|
|
@@ -305,132 +189,51 @@ TEST_CASE("theta sketch: deserialize compact estimation from java as subclass",
|
|
|
305
189
|
REQUIRE(sketch.get_upper_bound(2) == Approx(update_sketch.get_upper_bound(2)).margin(1e-10));
|
|
306
190
|
REQUIRE(sketch.get_lower_bound(3) == Approx(update_sketch.get_lower_bound(3)).margin(1e-10));
|
|
307
191
|
REQUIRE(sketch.get_upper_bound(3) == Approx(update_sketch.get_upper_bound(3)).margin(1e-10));
|
|
192
|
+
compact_theta_sketch compact_sketch = update_sketch.compact();
|
|
193
|
+
// the sketches are ordered, so the iteration sequence must match exactly
|
|
194
|
+
auto iter = sketch.begin();
|
|
195
|
+
for (const auto& key: compact_sketch) {
|
|
196
|
+
REQUIRE(*iter == key);
|
|
197
|
+
++iter;
|
|
198
|
+
}
|
|
308
199
|
}
|
|
309
200
|
|
|
310
|
-
TEST_CASE("theta sketch: serialize deserialize stream and bytes
|
|
201
|
+
TEST_CASE("theta sketch: serialize deserialize stream and bytes equivalence", "[theta_sketch]") {
|
|
311
202
|
update_theta_sketch update_sketch = update_theta_sketch::builder().build();
|
|
312
203
|
const int n = 8192;
|
|
313
204
|
for (int i = 0; i < n; i++) update_sketch.update(i);
|
|
314
205
|
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
REQUIRE(bytes.
|
|
321
|
-
for (size_t i = 0; i < bytes.size(); ++i) {
|
|
322
|
-
REQUIRE(((char*)bytes.data())[i] == (char)s.get());
|
|
323
|
-
}
|
|
324
|
-
|
|
325
|
-
// deserialize as base class
|
|
326
|
-
{
|
|
327
|
-
s.seekg(0); // rewind
|
|
328
|
-
auto deserialized_sketch_ptr1 = theta_sketch::deserialize(s);
|
|
329
|
-
auto deserialized_sketch_ptr2 = theta_sketch::deserialize(bytes.data(), bytes.size());
|
|
330
|
-
REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
|
|
331
|
-
REQUIRE(deserialized_sketch_ptr2->is_empty() == deserialized_sketch_ptr1->is_empty());
|
|
332
|
-
REQUIRE(deserialized_sketch_ptr2->is_ordered() == deserialized_sketch_ptr1->is_ordered());
|
|
333
|
-
REQUIRE(deserialized_sketch_ptr2->get_num_retained() == deserialized_sketch_ptr1->get_num_retained());
|
|
334
|
-
REQUIRE(deserialized_sketch_ptr2->get_theta() == deserialized_sketch_ptr1->get_theta());
|
|
335
|
-
REQUIRE(deserialized_sketch_ptr2->get_estimate() == deserialized_sketch_ptr1->get_estimate());
|
|
336
|
-
REQUIRE(deserialized_sketch_ptr2->get_lower_bound(1) == deserialized_sketch_ptr1->get_lower_bound(1));
|
|
337
|
-
REQUIRE(deserialized_sketch_ptr2->get_upper_bound(1) == deserialized_sketch_ptr1->get_upper_bound(1));
|
|
338
|
-
// hash tables must be identical since they are restored from dumps, and iteration is deterministic
|
|
339
|
-
auto iter = deserialized_sketch_ptr1->begin();
|
|
340
|
-
for (auto key: *deserialized_sketch_ptr2) {
|
|
341
|
-
REQUIRE(*iter == key);
|
|
342
|
-
++iter;
|
|
343
|
-
}
|
|
344
|
-
}
|
|
345
|
-
|
|
346
|
-
// deserialize as subclass
|
|
347
|
-
{
|
|
348
|
-
s.seekg(0); // rewind
|
|
349
|
-
update_theta_sketch deserialized_sketch1 = update_theta_sketch::deserialize(s);
|
|
350
|
-
update_theta_sketch deserialized_sketch2 = update_theta_sketch::deserialize(bytes.data(), bytes.size());
|
|
351
|
-
REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
|
|
352
|
-
REQUIRE(deserialized_sketch2.is_empty() == deserialized_sketch1.is_empty());
|
|
353
|
-
REQUIRE(deserialized_sketch2.is_ordered() == deserialized_sketch1.is_ordered());
|
|
354
|
-
REQUIRE(deserialized_sketch2.get_num_retained() == deserialized_sketch1.get_num_retained());
|
|
355
|
-
REQUIRE(deserialized_sketch2.get_theta() == deserialized_sketch1.get_theta());
|
|
356
|
-
REQUIRE(deserialized_sketch2.get_estimate() == deserialized_sketch1.get_estimate());
|
|
357
|
-
REQUIRE(deserialized_sketch2.get_lower_bound(1) == deserialized_sketch1.get_lower_bound(1));
|
|
358
|
-
REQUIRE(deserialized_sketch2.get_upper_bound(1) == deserialized_sketch1.get_upper_bound(1));
|
|
359
|
-
// hash tables must be identical since they are restored from dumps, and iteration is deterministic
|
|
360
|
-
auto iter = deserialized_sketch1.begin();
|
|
361
|
-
for (auto key: deserialized_sketch2) {
|
|
362
|
-
REQUIRE(*iter == key);
|
|
363
|
-
++iter;
|
|
364
|
-
}
|
|
365
|
-
}
|
|
206
|
+
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
|
207
|
+
update_sketch.compact().serialize(s);
|
|
208
|
+
auto bytes = update_sketch.compact().serialize();
|
|
209
|
+
REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
|
|
210
|
+
for (size_t i = 0; i < bytes.size(); ++i) {
|
|
211
|
+
REQUIRE(((char*)bytes.data())[i] == (char)s.get());
|
|
366
212
|
}
|
|
367
213
|
|
|
368
|
-
//
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
REQUIRE(deserialized_sketch_ptr2->is_empty() == deserialized_sketch_ptr1->is_empty());
|
|
385
|
-
REQUIRE(deserialized_sketch_ptr2->is_ordered() == deserialized_sketch_ptr1->is_ordered());
|
|
386
|
-
REQUIRE(deserialized_sketch_ptr2->get_num_retained() == deserialized_sketch_ptr1->get_num_retained());
|
|
387
|
-
REQUIRE(deserialized_sketch_ptr2->get_theta() == deserialized_sketch_ptr1->get_theta());
|
|
388
|
-
REQUIRE(deserialized_sketch_ptr2->get_estimate() == deserialized_sketch_ptr1->get_estimate());
|
|
389
|
-
REQUIRE(deserialized_sketch_ptr2->get_lower_bound(1) == deserialized_sketch_ptr1->get_lower_bound(1));
|
|
390
|
-
REQUIRE(deserialized_sketch_ptr2->get_upper_bound(1) == deserialized_sketch_ptr1->get_upper_bound(1));
|
|
391
|
-
// the sketches are ordered, so the iteration sequence must match exactly
|
|
392
|
-
auto iter = deserialized_sketch_ptr1->begin();
|
|
393
|
-
for (auto key: *deserialized_sketch_ptr2) {
|
|
394
|
-
REQUIRE(*iter == key);
|
|
395
|
-
++iter;
|
|
396
|
-
}
|
|
397
|
-
}
|
|
398
|
-
|
|
399
|
-
// deserialize as subclass
|
|
400
|
-
{
|
|
401
|
-
s.seekg(0); // rewind
|
|
402
|
-
compact_theta_sketch deserialized_sketch1 = compact_theta_sketch::deserialize(s);
|
|
403
|
-
compact_theta_sketch deserialized_sketch2 = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
|
|
404
|
-
REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
|
|
405
|
-
REQUIRE(deserialized_sketch2.is_empty() == deserialized_sketch1.is_empty());
|
|
406
|
-
REQUIRE(deserialized_sketch2.is_ordered() == deserialized_sketch1.is_ordered());
|
|
407
|
-
REQUIRE(deserialized_sketch2.get_num_retained() == deserialized_sketch1.get_num_retained());
|
|
408
|
-
REQUIRE(deserialized_sketch2.get_theta() == deserialized_sketch1.get_theta());
|
|
409
|
-
REQUIRE(deserialized_sketch2.get_estimate() == deserialized_sketch1.get_estimate());
|
|
410
|
-
REQUIRE(deserialized_sketch2.get_lower_bound(1) == deserialized_sketch1.get_lower_bound(1));
|
|
411
|
-
REQUIRE(deserialized_sketch2.get_upper_bound(1) == deserialized_sketch1.get_upper_bound(1));
|
|
412
|
-
// the sketches are ordered, so the iteration sequence must match exactly
|
|
413
|
-
auto iter = deserialized_sketch1.begin();
|
|
414
|
-
for (auto key: deserialized_sketch2) {
|
|
415
|
-
REQUIRE(*iter == key);
|
|
416
|
-
++iter;
|
|
417
|
-
}
|
|
418
|
-
}
|
|
214
|
+
s.seekg(0); // rewind
|
|
215
|
+
compact_theta_sketch deserialized_sketch1 = compact_theta_sketch::deserialize(s);
|
|
216
|
+
compact_theta_sketch deserialized_sketch2 = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
|
|
217
|
+
REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
|
|
218
|
+
REQUIRE(deserialized_sketch2.is_empty() == deserialized_sketch1.is_empty());
|
|
219
|
+
REQUIRE(deserialized_sketch2.is_ordered() == deserialized_sketch1.is_ordered());
|
|
220
|
+
REQUIRE(deserialized_sketch2.get_num_retained() == deserialized_sketch1.get_num_retained());
|
|
221
|
+
REQUIRE(deserialized_sketch2.get_theta() == deserialized_sketch1.get_theta());
|
|
222
|
+
REQUIRE(deserialized_sketch2.get_estimate() == deserialized_sketch1.get_estimate());
|
|
223
|
+
REQUIRE(deserialized_sketch2.get_lower_bound(1) == deserialized_sketch1.get_lower_bound(1));
|
|
224
|
+
REQUIRE(deserialized_sketch2.get_upper_bound(1) == deserialized_sketch1.get_upper_bound(1));
|
|
225
|
+
// the sketches are ordered, so the iteration sequence must match exactly
|
|
226
|
+
auto iter = deserialized_sketch1.begin();
|
|
227
|
+
for (auto key: deserialized_sketch2) {
|
|
228
|
+
REQUIRE(*iter == key);
|
|
229
|
+
++iter;
|
|
419
230
|
}
|
|
420
231
|
}
|
|
421
232
|
|
|
422
|
-
TEST_CASE("theta sketch: deserialize update single item buffer overrun", "[theta_sketch]") {
|
|
423
|
-
update_theta_sketch update_sketch = update_theta_sketch::builder().build();
|
|
424
|
-
update_sketch.update(1);
|
|
425
|
-
theta_sketch::vector_bytes bytes = update_sketch.serialize();
|
|
426
|
-
REQUIRE_THROWS_AS(update_theta_sketch::deserialize(bytes.data(), 7), std::out_of_range);
|
|
427
|
-
REQUIRE_THROWS_AS(update_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
|
|
428
|
-
}
|
|
429
|
-
|
|
430
233
|
TEST_CASE("theta sketch: deserialize compact single item buffer overrun", "[theta_sketch]") {
|
|
431
234
|
update_theta_sketch update_sketch = update_theta_sketch::builder().build();
|
|
432
235
|
update_sketch.update(1);
|
|
433
|
-
|
|
236
|
+
auto bytes = update_sketch.compact().serialize();
|
|
434
237
|
REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 7), std::out_of_range);
|
|
435
238
|
REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
|
|
436
239
|
}
|