datasketches 0.2.0 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE +40 -3
- data/NOTICE +1 -1
- data/README.md +7 -7
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
- data/vendor/datasketches-cpp/LICENSE +40 -3
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/pyproject.toml +4 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
- data/vendor/datasketches-cpp/python/README.md +50 -50
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +10 -7
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
- metadata +18 -7
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
|
@@ -23,8 +23,8 @@
|
|
|
23
23
|
namespace datasketches {
|
|
24
24
|
|
|
25
25
|
template<typename A>
|
|
26
|
-
theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator):
|
|
27
|
-
state_(lg_cur_size, lg_nom_size, rf, theta, seed,
|
|
26
|
+
theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const A& allocator):
|
|
27
|
+
state_(lg_cur_size, lg_nom_size, rf, p, theta, seed, nop_policy(), allocator)
|
|
28
28
|
{}
|
|
29
29
|
|
|
30
30
|
template<typename A>
|
|
@@ -38,14 +38,17 @@ auto theta_union_alloc<A>::get_result(bool ordered) const -> CompactSketch {
|
|
|
38
38
|
return state_.get_result(ordered);
|
|
39
39
|
}
|
|
40
40
|
|
|
41
|
+
template<typename A>
|
|
42
|
+
void theta_union_alloc<A>::reset() {
|
|
43
|
+
state_.reset();
|
|
44
|
+
}
|
|
45
|
+
|
|
41
46
|
template<typename A>
|
|
42
47
|
theta_union_alloc<A>::builder::builder(const A& allocator): theta_base_builder<builder, A>(allocator) {}
|
|
43
48
|
|
|
44
49
|
template<typename A>
|
|
45
50
|
auto theta_union_alloc<A>::builder::build() const -> theta_union_alloc {
|
|
46
|
-
return theta_union_alloc(
|
|
47
|
-
this->starting_sub_multiple(this->lg_k_ + 1, this->MIN_LG_K, static_cast<uint8_t>(this->rf_)),
|
|
48
|
-
this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
|
|
51
|
+
return theta_union_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->allocator_);
|
|
49
52
|
}
|
|
50
53
|
|
|
51
54
|
} /* namespace datasketches */
|
|
@@ -40,8 +40,8 @@ struct theta_update_sketch_base {
|
|
|
40
40
|
using resize_factor = theta_constants::resize_factor;
|
|
41
41
|
using comparator = compare_by_key<ExtractKey>;
|
|
42
42
|
|
|
43
|
-
theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
|
|
44
|
-
uint64_t seed, const Allocator& allocator, bool is_empty = true);
|
|
43
|
+
theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p,
|
|
44
|
+
uint64_t theta, uint64_t seed, const Allocator& allocator, bool is_empty = true);
|
|
45
45
|
theta_update_sketch_base(const theta_update_sketch_base& other);
|
|
46
46
|
theta_update_sketch_base(theta_update_sketch_base&& other) noexcept;
|
|
47
47
|
~theta_update_sketch_base();
|
|
@@ -53,6 +53,8 @@ struct theta_update_sketch_base {
|
|
|
53
53
|
inline uint64_t hash_and_screen(const void* data, size_t length);
|
|
54
54
|
|
|
55
55
|
inline std::pair<iterator, bool> find(uint64_t key) const;
|
|
56
|
+
static inline std::pair<iterator, bool> find(Entry* entries, uint8_t lg_size, uint64_t key);
|
|
57
|
+
|
|
56
58
|
|
|
57
59
|
template<typename FwdEntry>
|
|
58
60
|
inline void insert(iterator it, FwdEntry&& entry);
|
|
@@ -73,6 +75,7 @@ struct theta_update_sketch_base {
|
|
|
73
75
|
uint8_t lg_cur_size_;
|
|
74
76
|
uint8_t lg_nom_size_;
|
|
75
77
|
resize_factor rf_;
|
|
78
|
+
float p_;
|
|
76
79
|
uint32_t num_entries_;
|
|
77
80
|
uint64_t theta_;
|
|
78
81
|
uint64_t seed_;
|
|
@@ -81,6 +84,7 @@ struct theta_update_sketch_base {
|
|
|
81
84
|
void resize();
|
|
82
85
|
void rebuild();
|
|
83
86
|
void trim();
|
|
87
|
+
void reset();
|
|
84
88
|
|
|
85
89
|
static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
|
|
86
90
|
static inline uint32_t get_stride(uint64_t key, uint8_t lg_size);
|
|
@@ -92,11 +96,14 @@ struct theta_update_sketch_base {
|
|
|
92
96
|
template<typename Derived, typename Allocator>
|
|
93
97
|
class theta_base_builder {
|
|
94
98
|
public:
|
|
99
|
+
// TODO: Redundant and deprecated. Will be removed in next major version release.
|
|
95
100
|
using resize_factor = theta_constants::resize_factor;
|
|
96
101
|
static const uint8_t MIN_LG_K = theta_constants::MIN_LG_K;
|
|
97
102
|
static const uint8_t MAX_LG_K = theta_constants::MAX_LG_K;
|
|
98
|
-
|
|
99
|
-
|
|
103
|
+
// TODO: The following defaults are redundant and deprecated. Will be removed in the
|
|
104
|
+
// next major version release
|
|
105
|
+
static const uint8_t DEFAULT_LG_K = theta_constants::DEFAULT_LG_K;
|
|
106
|
+
static const resize_factor DEFAULT_RESIZE_FACTOR = theta_constants::DEFAULT_RESIZE_FACTOR;
|
|
100
107
|
|
|
101
108
|
/**
|
|
102
109
|
* Creates an instance of the builder with default parameters.
|
|
@@ -144,7 +151,6 @@ protected:
|
|
|
144
151
|
|
|
145
152
|
uint64_t starting_theta() const;
|
|
146
153
|
uint8_t starting_lg_size() const;
|
|
147
|
-
static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
|
|
148
154
|
};
|
|
149
155
|
|
|
150
156
|
// key extractor
|
|
@@ -24,22 +24,25 @@
|
|
|
24
24
|
#include <sstream>
|
|
25
25
|
#include <algorithm>
|
|
26
26
|
|
|
27
|
+
#include "theta_helpers.hpp"
|
|
28
|
+
|
|
27
29
|
namespace datasketches {
|
|
28
30
|
|
|
29
31
|
template<typename EN, typename EK, typename A>
|
|
30
|
-
theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator, bool is_empty):
|
|
32
|
+
theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const A& allocator, bool is_empty):
|
|
31
33
|
allocator_(allocator),
|
|
32
34
|
is_empty_(is_empty),
|
|
33
35
|
lg_cur_size_(lg_cur_size),
|
|
34
36
|
lg_nom_size_(lg_nom_size),
|
|
35
37
|
rf_(rf),
|
|
38
|
+
p_(p),
|
|
36
39
|
num_entries_(0),
|
|
37
40
|
theta_(theta),
|
|
38
41
|
seed_(seed),
|
|
39
42
|
entries_(nullptr)
|
|
40
43
|
{
|
|
41
44
|
if (lg_cur_size > 0) {
|
|
42
|
-
const size_t size =
|
|
45
|
+
const size_t size = 1ULL << lg_cur_size;
|
|
43
46
|
entries_ = allocator_.allocate(size);
|
|
44
47
|
for (size_t i = 0; i < size; ++i) EK()(entries_[i]) = 0;
|
|
45
48
|
}
|
|
@@ -52,13 +55,14 @@ is_empty_(other.is_empty_),
|
|
|
52
55
|
lg_cur_size_(other.lg_cur_size_),
|
|
53
56
|
lg_nom_size_(other.lg_nom_size_),
|
|
54
57
|
rf_(other.rf_),
|
|
58
|
+
p_(other.p_),
|
|
55
59
|
num_entries_(other.num_entries_),
|
|
56
60
|
theta_(other.theta_),
|
|
57
61
|
seed_(other.seed_),
|
|
58
62
|
entries_(nullptr)
|
|
59
63
|
{
|
|
60
64
|
if (other.entries_ != nullptr) {
|
|
61
|
-
const size_t size =
|
|
65
|
+
const size_t size = 1ULL << lg_cur_size_;
|
|
62
66
|
entries_ = allocator_.allocate(size);
|
|
63
67
|
for (size_t i = 0; i < size; ++i) {
|
|
64
68
|
if (EK()(other.entries_[i]) != 0) {
|
|
@@ -77,6 +81,7 @@ is_empty_(other.is_empty_),
|
|
|
77
81
|
lg_cur_size_(other.lg_cur_size_),
|
|
78
82
|
lg_nom_size_(other.lg_nom_size_),
|
|
79
83
|
rf_(other.rf_),
|
|
84
|
+
p_(other.p_),
|
|
80
85
|
num_entries_(other.num_entries_),
|
|
81
86
|
theta_(other.theta_),
|
|
82
87
|
seed_(other.seed_),
|
|
@@ -89,7 +94,7 @@ template<typename EN, typename EK, typename A>
|
|
|
89
94
|
theta_update_sketch_base<EN, EK, A>::~theta_update_sketch_base()
|
|
90
95
|
{
|
|
91
96
|
if (entries_ != nullptr) {
|
|
92
|
-
const size_t size =
|
|
97
|
+
const size_t size = 1ULL << lg_cur_size_;
|
|
93
98
|
for (size_t i = 0; i < size; ++i) {
|
|
94
99
|
if (EK()(entries_[i]) != 0) entries_[i].~EN();
|
|
95
100
|
}
|
|
@@ -105,6 +110,7 @@ theta_update_sketch_base<EN, EK, A>& theta_update_sketch_base<EN, EK, A>::operat
|
|
|
105
110
|
std::swap(lg_cur_size_, copy.lg_cur_size_);
|
|
106
111
|
std::swap(lg_nom_size_, copy.lg_nom_size_);
|
|
107
112
|
std::swap(rf_, copy.rf_);
|
|
113
|
+
std::swap(p_, copy.p_);
|
|
108
114
|
std::swap(num_entries_, copy.num_entries_);
|
|
109
115
|
std::swap(theta_, copy.theta_);
|
|
110
116
|
std::swap(seed_, copy.seed_);
|
|
@@ -119,6 +125,7 @@ theta_update_sketch_base<EN, EK, A>& theta_update_sketch_base<EN, EK, A>::operat
|
|
|
119
125
|
std::swap(lg_cur_size_, other.lg_cur_size_);
|
|
120
126
|
std::swap(lg_nom_size_, other.lg_nom_size_);
|
|
121
127
|
std::swap(rf_, other.rf_);
|
|
128
|
+
std::swap(p_, other.p_);
|
|
122
129
|
std::swap(num_entries_, other.num_entries_);
|
|
123
130
|
std::swap(theta_, other.theta_);
|
|
124
131
|
std::swap(seed_, other.seed_);
|
|
@@ -136,18 +143,23 @@ uint64_t theta_update_sketch_base<EN, EK, A>::hash_and_screen(const void* data,
|
|
|
136
143
|
|
|
137
144
|
template<typename EN, typename EK, typename A>
|
|
138
145
|
auto theta_update_sketch_base<EN, EK, A>::find(uint64_t key) const -> std::pair<iterator, bool> {
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
146
|
+
return find(entries_, lg_cur_size_, key);
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
template<typename EN, typename EK, typename A>
|
|
150
|
+
auto theta_update_sketch_base<EN, EK, A>::find(EN* entries, uint8_t lg_size, uint64_t key) -> std::pair<iterator, bool> {
|
|
151
|
+
const uint32_t size = 1 << lg_size;
|
|
152
|
+
const uint32_t mask = size - 1;
|
|
153
|
+
const uint32_t stride = get_stride(key, lg_size);
|
|
142
154
|
uint32_t index = static_cast<uint32_t>(key) & mask;
|
|
143
155
|
// search for duplicate or zero
|
|
144
156
|
const uint32_t loop_index = index;
|
|
145
157
|
do {
|
|
146
|
-
const uint64_t probe = EK()(
|
|
158
|
+
const uint64_t probe = EK()(entries[index]);
|
|
147
159
|
if (probe == 0) {
|
|
148
|
-
return std::pair<iterator, bool>(&
|
|
160
|
+
return std::pair<iterator, bool>(&entries[index], false);
|
|
149
161
|
} else if (probe == key) {
|
|
150
|
-
return std::pair<iterator, bool>(&
|
|
162
|
+
return std::pair<iterator, bool>(&entries[index], true);
|
|
151
163
|
}
|
|
152
164
|
index = (index + stride) & mask;
|
|
153
165
|
} while (index != loop_index);
|
|
@@ -175,13 +187,13 @@ auto theta_update_sketch_base<EN, EK, A>::begin() const -> iterator {
|
|
|
175
187
|
|
|
176
188
|
template<typename EN, typename EK, typename A>
|
|
177
189
|
auto theta_update_sketch_base<EN, EK, A>::end() const -> iterator {
|
|
178
|
-
return &entries_[
|
|
190
|
+
return &entries_[1ULL << lg_cur_size_];
|
|
179
191
|
}
|
|
180
192
|
|
|
181
193
|
template<typename EN, typename EK, typename A>
|
|
182
194
|
uint32_t theta_update_sketch_base<EN, EK, A>::get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size) {
|
|
183
195
|
const double fraction = (lg_cur_size <= lg_nom_size) ? RESIZE_THRESHOLD : REBUILD_THRESHOLD;
|
|
184
|
-
return std::floor(fraction * (1 << lg_cur_size));
|
|
196
|
+
return static_cast<uint32_t>(std::floor(fraction * (1 << lg_cur_size)));
|
|
185
197
|
}
|
|
186
198
|
|
|
187
199
|
template<typename EN, typename EK, typename A>
|
|
@@ -192,29 +204,29 @@ uint32_t theta_update_sketch_base<EN, EK, A>::get_stride(uint64_t key, uint8_t l
|
|
|
192
204
|
|
|
193
205
|
template<typename EN, typename EK, typename A>
|
|
194
206
|
void theta_update_sketch_base<EN, EK, A>::resize() {
|
|
195
|
-
const size_t old_size =
|
|
196
|
-
const uint8_t
|
|
197
|
-
const
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
EN* old_entries = entries_;
|
|
201
|
-
entries_ = allocator_.allocate(new_size);
|
|
202
|
-
for (size_t i = 0; i < new_size; ++i) EK()(entries_[i]) = 0;
|
|
203
|
-
num_entries_ = 0;
|
|
207
|
+
const size_t old_size = 1ULL << lg_cur_size_;
|
|
208
|
+
const uint8_t lg_new_size = std::min<uint8_t>(lg_cur_size_ + static_cast<uint8_t>(rf_), lg_nom_size_ + 1);
|
|
209
|
+
const size_t new_size = 1ULL << lg_new_size;
|
|
210
|
+
EN* new_entries = allocator_.allocate(new_size);
|
|
211
|
+
for (size_t i = 0; i < new_size; ++i) EK()(new_entries[i]) = 0;
|
|
204
212
|
for (size_t i = 0; i < old_size; ++i) {
|
|
205
|
-
const uint64_t key = EK()(
|
|
213
|
+
const uint64_t key = EK()(entries_[i]);
|
|
206
214
|
if (key != 0) {
|
|
207
|
-
|
|
208
|
-
|
|
215
|
+
// always finds an empty slot in a larger table
|
|
216
|
+
new (find(new_entries, lg_new_size, key).first) EN(std::move(entries_[i]));
|
|
217
|
+
entries_[i].~EN();
|
|
218
|
+
EK()(entries_[i]) = 0;
|
|
209
219
|
}
|
|
210
220
|
}
|
|
211
|
-
|
|
221
|
+
std::swap(entries_, new_entries);
|
|
222
|
+
lg_cur_size_ = lg_new_size;
|
|
223
|
+
allocator_.deallocate(new_entries, old_size);
|
|
212
224
|
}
|
|
213
225
|
|
|
214
226
|
// assumes number of entries > nominal size
|
|
215
227
|
template<typename EN, typename EK, typename A>
|
|
216
228
|
void theta_update_sketch_base<EN, EK, A>::rebuild() {
|
|
217
|
-
const size_t size =
|
|
229
|
+
const size_t size = 1ULL << lg_cur_size_;
|
|
218
230
|
const uint32_t nominal_size = 1 << lg_nom_size_;
|
|
219
231
|
|
|
220
232
|
// empty entries have uninitialized payloads
|
|
@@ -227,10 +239,10 @@ void theta_update_sketch_base<EN, EK, A>::rebuild() {
|
|
|
227
239
|
const size_t num_old_entries = num_entries_;
|
|
228
240
|
entries_ = allocator_.allocate(size);
|
|
229
241
|
for (size_t i = 0; i < size; ++i) EK()(entries_[i]) = 0;
|
|
230
|
-
num_entries_ =
|
|
242
|
+
num_entries_ = nominal_size;
|
|
231
243
|
// relies on consolidating non-empty entries to the front
|
|
232
244
|
for (size_t i = 0; i < nominal_size; ++i) {
|
|
233
|
-
|
|
245
|
+
new (find(EK()(old_entries[i])).first) EN(std::move(old_entries[i]));
|
|
234
246
|
old_entries[i].~EN();
|
|
235
247
|
}
|
|
236
248
|
for (size_t i = nominal_size; i < num_old_entries; ++i) old_entries[i].~EN();
|
|
@@ -242,6 +254,29 @@ void theta_update_sketch_base<EN, EK, A>::trim() {
|
|
|
242
254
|
if (num_entries_ > static_cast<uint32_t>(1 << lg_nom_size_)) rebuild();
|
|
243
255
|
}
|
|
244
256
|
|
|
257
|
+
template<typename EN, typename EK, typename A>
|
|
258
|
+
void theta_update_sketch_base<EN, EK, A>::reset() {
|
|
259
|
+
const size_t cur_size = 1ULL << lg_cur_size_;
|
|
260
|
+
for (size_t i = 0; i < cur_size; ++i) {
|
|
261
|
+
if (EK()(entries_[i]) != 0) {
|
|
262
|
+
entries_[i].~EN();
|
|
263
|
+
EK()(entries_[i]) = 0;
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
const uint8_t starting_lg_size = theta_build_helper<true>::starting_sub_multiple(
|
|
267
|
+
lg_nom_size_ + 1, theta_constants::MIN_LG_K, static_cast<uint8_t>(rf_));
|
|
268
|
+
if (starting_lg_size != lg_cur_size_) {
|
|
269
|
+
allocator_.deallocate(entries_, cur_size);
|
|
270
|
+
lg_cur_size_ = starting_lg_size;
|
|
271
|
+
const size_t new_size = 1ULL << starting_lg_size;
|
|
272
|
+
entries_ = allocator_.allocate(new_size);
|
|
273
|
+
for (size_t i = 0; i < new_size; ++i) EK()(entries_[i]) = 0;
|
|
274
|
+
}
|
|
275
|
+
num_entries_ = 0;
|
|
276
|
+
theta_ = theta_build_helper<true>::starting_theta_from_p(p_);
|
|
277
|
+
is_empty_ = true;
|
|
278
|
+
}
|
|
279
|
+
|
|
245
280
|
template<typename EN, typename EK, typename A>
|
|
246
281
|
void theta_update_sketch_base<EN, EK, A>::consolidate_non_empty(EN* entries, size_t size, size_t num) {
|
|
247
282
|
// find the first empty slot
|
|
@@ -266,7 +301,11 @@ void theta_update_sketch_base<EN, EK, A>::consolidate_non_empty(EN* entries, siz
|
|
|
266
301
|
|
|
267
302
|
template<typename Derived, typename Allocator>
|
|
268
303
|
theta_base_builder<Derived, Allocator>::theta_base_builder(const Allocator& allocator):
|
|
269
|
-
allocator_(allocator),
|
|
304
|
+
allocator_(allocator),
|
|
305
|
+
lg_k_(theta_constants::DEFAULT_LG_K),
|
|
306
|
+
rf_(theta_constants::DEFAULT_RESIZE_FACTOR),
|
|
307
|
+
p_(1),
|
|
308
|
+
seed_(DEFAULT_SEED) {}
|
|
270
309
|
|
|
271
310
|
template<typename Derived, typename Allocator>
|
|
272
311
|
Derived& theta_base_builder<Derived, Allocator>::set_lg_k(uint8_t lg_k) {
|
|
@@ -301,18 +340,12 @@ Derived& theta_base_builder<Derived, Allocator>::set_seed(uint64_t seed) {
|
|
|
301
340
|
|
|
302
341
|
template<typename Derived, typename Allocator>
|
|
303
342
|
uint64_t theta_base_builder<Derived, Allocator>::starting_theta() const {
|
|
304
|
-
|
|
305
|
-
return theta_constants::MAX_THETA;
|
|
343
|
+
return theta_build_helper<true>::starting_theta_from_p(p_);
|
|
306
344
|
}
|
|
307
345
|
|
|
308
346
|
template<typename Derived, typename Allocator>
|
|
309
347
|
uint8_t theta_base_builder<Derived, Allocator>::starting_lg_size() const {
|
|
310
|
-
return starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_));
|
|
311
|
-
}
|
|
312
|
-
|
|
313
|
-
template<typename Derived, typename Allocator>
|
|
314
|
-
uint8_t theta_base_builder<Derived, Allocator>::starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
|
|
315
|
-
return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
|
|
348
|
+
return theta_build_helper<true>::starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_));
|
|
316
349
|
}
|
|
317
350
|
|
|
318
351
|
// iterator
|
|
@@ -37,7 +37,7 @@ TEST_CASE("theta a-not-b: empty", "[theta_a_not_b]") {
|
|
|
37
37
|
TEST_CASE("theta a-not-b: non empty no retained keys", "[theta_a_not_b]") {
|
|
38
38
|
update_theta_sketch a = update_theta_sketch::builder().build();
|
|
39
39
|
a.update(1);
|
|
40
|
-
update_theta_sketch b = update_theta_sketch::builder().set_p(0.
|
|
40
|
+
update_theta_sketch b = update_theta_sketch::builder().set_p(0.001f).build();
|
|
41
41
|
theta_a_not_b a_not_b;
|
|
42
42
|
|
|
43
43
|
// B is still empty
|
|
@@ -167,6 +167,28 @@ TEST_CASE("theta a-not-b: estimation mode half overlap", "[theta_a_not_b]") {
|
|
|
167
167
|
REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
|
|
168
168
|
}
|
|
169
169
|
|
|
170
|
+
TEST_CASE("theta a-not-b: estimation mode half overlap wrapped compact", "[theta_a_not_b]") {
|
|
171
|
+
update_theta_sketch a = update_theta_sketch::builder().build();
|
|
172
|
+
int value = 0;
|
|
173
|
+
for (int i = 0; i < 10000; i++) a.update(value++);
|
|
174
|
+
auto bytes_a = a.compact().serialize();
|
|
175
|
+
|
|
176
|
+
update_theta_sketch b = update_theta_sketch::builder().build();
|
|
177
|
+
value = 5000;
|
|
178
|
+
for (int i = 0; i < 10000; i++) b.update(value++);
|
|
179
|
+
auto bytes_b = b.compact().serialize();
|
|
180
|
+
|
|
181
|
+
theta_a_not_b a_not_b;
|
|
182
|
+
|
|
183
|
+
auto result = a_not_b.compute(
|
|
184
|
+
wrapped_compact_theta_sketch::wrap(bytes_a.data(), bytes_a.size()),
|
|
185
|
+
wrapped_compact_theta_sketch::wrap(bytes_b.data(), bytes_b.size())
|
|
186
|
+
);
|
|
187
|
+
REQUIRE_FALSE(result.is_empty());
|
|
188
|
+
REQUIRE(result.is_estimation_mode());
|
|
189
|
+
REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
|
|
190
|
+
}
|
|
191
|
+
|
|
170
192
|
TEST_CASE("theta a-not-b: estimation mode disjoint", "[theta_a_not_b]") {
|
|
171
193
|
update_theta_sketch a = update_theta_sketch::builder().build();
|
|
172
194
|
int value = 0;
|
|
Binary file
|
|
@@ -48,7 +48,7 @@ TEST_CASE("theta intersection: empty", "[theta_intersection]") {
|
|
|
48
48
|
}
|
|
49
49
|
|
|
50
50
|
TEST_CASE("theta intersection: non empty no retained keys", "[theta_intersection]") {
|
|
51
|
-
update_theta_sketch sketch = update_theta_sketch::builder().set_p(0.
|
|
51
|
+
update_theta_sketch sketch = update_theta_sketch::builder().set_p(0.001f).build();
|
|
52
52
|
sketch.update(1);
|
|
53
53
|
theta_intersection intersection;
|
|
54
54
|
intersection.update(sketch);
|
|
@@ -174,6 +174,26 @@ TEST_CASE("theta intersection: estimation mode half overlap ordered", "[theta_in
|
|
|
174
174
|
REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
|
|
175
175
|
}
|
|
176
176
|
|
|
177
|
+
TEST_CASE("theta intersection: estimation mode half overlap ordered wrapped compact", "[theta_intersection]") {
|
|
178
|
+
update_theta_sketch sketch1 = update_theta_sketch::builder().build();
|
|
179
|
+
int value = 0;
|
|
180
|
+
for (int i = 0; i < 10000; i++) sketch1.update(value++);
|
|
181
|
+
auto bytes1 = sketch1.compact().serialize();
|
|
182
|
+
|
|
183
|
+
update_theta_sketch sketch2 = update_theta_sketch::builder().build();
|
|
184
|
+
value = 5000;
|
|
185
|
+
for (int i = 0; i < 10000; i++) sketch2.update(value++);
|
|
186
|
+
auto bytes2 = sketch2.compact().serialize();
|
|
187
|
+
|
|
188
|
+
theta_intersection intersection;
|
|
189
|
+
intersection.update(wrapped_compact_theta_sketch::wrap(bytes1.data(), bytes1.size()));
|
|
190
|
+
intersection.update(wrapped_compact_theta_sketch::wrap(bytes2.data(), bytes2.size()));
|
|
191
|
+
compact_theta_sketch result = intersection.get_result();
|
|
192
|
+
REQUIRE_FALSE(result.is_empty());
|
|
193
|
+
REQUIRE(result.is_estimation_mode());
|
|
194
|
+
REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
|
|
195
|
+
}
|
|
196
|
+
|
|
177
197
|
TEST_CASE("theta intersection: estimation mode disjoint unordered", "[theta_intersection]") {
|
|
178
198
|
update_theta_sketch sketch1 = update_theta_sketch::builder().build();
|
|
179
199
|
int value = 0;
|
|
@@ -100,6 +100,28 @@ TEST_CASE("theta jaccard: half overlap estimation mode", "[theta_sketch]") {
|
|
|
100
100
|
REQUIRE(jc[2] == Approx(0.33).margin(0.01));
|
|
101
101
|
}
|
|
102
102
|
|
|
103
|
+
TEST_CASE("theta jaccard: half overlap estimation mode custom seed", "[theta_sketch]") {
|
|
104
|
+
const uint64_t seed = 123;
|
|
105
|
+
auto sk_a = update_theta_sketch::builder().set_seed(seed).build();
|
|
106
|
+
auto sk_b = update_theta_sketch::builder().set_seed(seed).build();
|
|
107
|
+
for (int i = 0; i < 10000; ++i) {
|
|
108
|
+
sk_a.update(i);
|
|
109
|
+
sk_b.update(i + 5000);
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
// update sketches
|
|
113
|
+
auto jc = theta_jaccard_similarity::jaccard(sk_a, sk_b, seed);
|
|
114
|
+
REQUIRE(jc[0] == Approx(0.33).margin(0.01));
|
|
115
|
+
REQUIRE(jc[1] == Approx(0.33).margin(0.01));
|
|
116
|
+
REQUIRE(jc[2] == Approx(0.33).margin(0.01));
|
|
117
|
+
|
|
118
|
+
// compact sketches
|
|
119
|
+
jc = theta_jaccard_similarity::jaccard(sk_a.compact(), sk_b.compact(), seed);
|
|
120
|
+
REQUIRE(jc[0] == Approx(0.33).margin(0.01));
|
|
121
|
+
REQUIRE(jc[1] == Approx(0.33).margin(0.01));
|
|
122
|
+
REQUIRE(jc[2] == Approx(0.33).margin(0.01));
|
|
123
|
+
}
|
|
124
|
+
|
|
103
125
|
/**
|
|
104
126
|
* The distribution is quite tight, about +/- 0.7%, which is pretty good since the accuracy of the
|
|
105
127
|
* underlying sketch is about +/- 1.56%.
|
|
@@ -107,7 +129,7 @@ TEST_CASE("theta jaccard: half overlap estimation mode", "[theta_sketch]") {
|
|
|
107
129
|
TEST_CASE("theta jaccard: similarity test", "[theta_sketch]") {
|
|
108
130
|
const int8_t min_lg_k = 12;
|
|
109
131
|
const int u1 = 1 << 20;
|
|
110
|
-
const int u2 = u1 * 0.95;
|
|
132
|
+
const int u2 = static_cast<int>(u1 * 0.95);
|
|
111
133
|
const double threshold = 0.943;
|
|
112
134
|
|
|
113
135
|
auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).build();
|
|
@@ -120,6 +142,23 @@ TEST_CASE("theta jaccard: similarity test", "[theta_sketch]") {
|
|
|
120
142
|
REQUIRE(theta_jaccard_similarity::similarity_test(actual, actual, threshold));
|
|
121
143
|
}
|
|
122
144
|
|
|
145
|
+
TEST_CASE("theta jaccard: similarity test custom seed", "[theta_sketch]") {
|
|
146
|
+
const int8_t min_lg_k = 12;
|
|
147
|
+
const int u1 = 1 << 20;
|
|
148
|
+
const int u2 = static_cast<int>(u1 * 0.95);
|
|
149
|
+
const double threshold = 0.943;
|
|
150
|
+
const uint64_t seed = 1234;
|
|
151
|
+
|
|
152
|
+
auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
|
|
153
|
+
for (int i = 0; i < u1; ++i) expected.update(i);
|
|
154
|
+
|
|
155
|
+
auto actual = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
|
|
156
|
+
for (int i = 0; i < u2; ++i) actual.update(i);
|
|
157
|
+
|
|
158
|
+
REQUIRE(theta_jaccard_similarity::similarity_test(actual, expected, threshold, seed));
|
|
159
|
+
REQUIRE(theta_jaccard_similarity::similarity_test(actual, actual, threshold, seed));
|
|
160
|
+
}
|
|
161
|
+
|
|
123
162
|
/**
|
|
124
163
|
* The distribution is much looser here, about +/- 14%. This is due to the fact that intersections lose accuracy
|
|
125
164
|
* as the ratio of intersection to the union becomes a small number.
|
|
@@ -127,7 +166,7 @@ TEST_CASE("theta jaccard: similarity test", "[theta_sketch]") {
|
|
|
127
166
|
TEST_CASE("theta jaccard: dissimilarity test", "[theta_sketch]") {
|
|
128
167
|
const int8_t min_lg_k = 12;
|
|
129
168
|
const int u1 = 1 << 20;
|
|
130
|
-
const int u2 = u1 * 0.05;
|
|
169
|
+
const int u2 = static_cast<int>(u1 * 0.05);
|
|
131
170
|
const double threshold = 0.061;
|
|
132
171
|
|
|
133
172
|
auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).build();
|
|
@@ -140,4 +179,21 @@ TEST_CASE("theta jaccard: dissimilarity test", "[theta_sketch]") {
|
|
|
140
179
|
REQUIRE_FALSE(theta_jaccard_similarity::dissimilarity_test(actual, actual, threshold));
|
|
141
180
|
}
|
|
142
181
|
|
|
182
|
+
TEST_CASE("theta jaccard: dissimilarity test custom seed", "[theta_sketch]") {
|
|
183
|
+
const int8_t min_lg_k = 12;
|
|
184
|
+
const int u1 = 1 << 20;
|
|
185
|
+
const int u2 = static_cast<int>(u1 * 0.05);
|
|
186
|
+
const double threshold = 0.061;
|
|
187
|
+
const uint64_t seed = 1234;
|
|
188
|
+
|
|
189
|
+
auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
|
|
190
|
+
for (int i = 0; i < u1; ++i) expected.update(i);
|
|
191
|
+
|
|
192
|
+
auto actual = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
|
|
193
|
+
for (int i = 0; i < u2; ++i) actual.update(i);
|
|
194
|
+
|
|
195
|
+
REQUIRE(theta_jaccard_similarity::dissimilarity_test(actual, expected, threshold, seed));
|
|
196
|
+
REQUIRE_FALSE(theta_jaccard_similarity::dissimilarity_test(actual, actual, threshold, seed));
|
|
197
|
+
}
|
|
198
|
+
|
|
143
199
|
} /* namespace datasketches */
|