datasketches 0.2.0 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE +40 -3
- data/NOTICE +1 -1
- data/README.md +7 -7
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
- data/vendor/datasketches-cpp/LICENSE +40 -3
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/pyproject.toml +4 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
- data/vendor/datasketches-cpp/python/README.md +50 -50
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +10 -7
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
- metadata +18 -7
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -23,8 +23,8 @@
|
|
23
23
|
namespace datasketches {
|
24
24
|
|
25
25
|
template<typename A>
|
26
|
-
theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator):
|
27
|
-
state_(lg_cur_size, lg_nom_size, rf, theta, seed,
|
26
|
+
theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const A& allocator):
|
27
|
+
state_(lg_cur_size, lg_nom_size, rf, p, theta, seed, nop_policy(), allocator)
|
28
28
|
{}
|
29
29
|
|
30
30
|
template<typename A>
|
@@ -38,14 +38,17 @@ auto theta_union_alloc<A>::get_result(bool ordered) const -> CompactSketch {
|
|
38
38
|
return state_.get_result(ordered);
|
39
39
|
}
|
40
40
|
|
41
|
+
template<typename A>
|
42
|
+
void theta_union_alloc<A>::reset() {
|
43
|
+
state_.reset();
|
44
|
+
}
|
45
|
+
|
41
46
|
template<typename A>
|
42
47
|
theta_union_alloc<A>::builder::builder(const A& allocator): theta_base_builder<builder, A>(allocator) {}
|
43
48
|
|
44
49
|
template<typename A>
|
45
50
|
auto theta_union_alloc<A>::builder::build() const -> theta_union_alloc {
|
46
|
-
return theta_union_alloc(
|
47
|
-
this->starting_sub_multiple(this->lg_k_ + 1, this->MIN_LG_K, static_cast<uint8_t>(this->rf_)),
|
48
|
-
this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
|
51
|
+
return theta_union_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->allocator_);
|
49
52
|
}
|
50
53
|
|
51
54
|
} /* namespace datasketches */
|
@@ -40,8 +40,8 @@ struct theta_update_sketch_base {
|
|
40
40
|
using resize_factor = theta_constants::resize_factor;
|
41
41
|
using comparator = compare_by_key<ExtractKey>;
|
42
42
|
|
43
|
-
theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
|
44
|
-
uint64_t seed, const Allocator& allocator, bool is_empty = true);
|
43
|
+
theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p,
|
44
|
+
uint64_t theta, uint64_t seed, const Allocator& allocator, bool is_empty = true);
|
45
45
|
theta_update_sketch_base(const theta_update_sketch_base& other);
|
46
46
|
theta_update_sketch_base(theta_update_sketch_base&& other) noexcept;
|
47
47
|
~theta_update_sketch_base();
|
@@ -53,6 +53,8 @@ struct theta_update_sketch_base {
|
|
53
53
|
inline uint64_t hash_and_screen(const void* data, size_t length);
|
54
54
|
|
55
55
|
inline std::pair<iterator, bool> find(uint64_t key) const;
|
56
|
+
static inline std::pair<iterator, bool> find(Entry* entries, uint8_t lg_size, uint64_t key);
|
57
|
+
|
56
58
|
|
57
59
|
template<typename FwdEntry>
|
58
60
|
inline void insert(iterator it, FwdEntry&& entry);
|
@@ -73,6 +75,7 @@ struct theta_update_sketch_base {
|
|
73
75
|
uint8_t lg_cur_size_;
|
74
76
|
uint8_t lg_nom_size_;
|
75
77
|
resize_factor rf_;
|
78
|
+
float p_;
|
76
79
|
uint32_t num_entries_;
|
77
80
|
uint64_t theta_;
|
78
81
|
uint64_t seed_;
|
@@ -81,6 +84,7 @@ struct theta_update_sketch_base {
|
|
81
84
|
void resize();
|
82
85
|
void rebuild();
|
83
86
|
void trim();
|
87
|
+
void reset();
|
84
88
|
|
85
89
|
static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
|
86
90
|
static inline uint32_t get_stride(uint64_t key, uint8_t lg_size);
|
@@ -92,11 +96,14 @@ struct theta_update_sketch_base {
|
|
92
96
|
template<typename Derived, typename Allocator>
|
93
97
|
class theta_base_builder {
|
94
98
|
public:
|
99
|
+
// TODO: Redundant and deprecated. Will be removed in next major version release.
|
95
100
|
using resize_factor = theta_constants::resize_factor;
|
96
101
|
static const uint8_t MIN_LG_K = theta_constants::MIN_LG_K;
|
97
102
|
static const uint8_t MAX_LG_K = theta_constants::MAX_LG_K;
|
98
|
-
|
99
|
-
|
103
|
+
// TODO: The following defaults are redundant and deprecated. Will be removed in the
|
104
|
+
// next major version release
|
105
|
+
static const uint8_t DEFAULT_LG_K = theta_constants::DEFAULT_LG_K;
|
106
|
+
static const resize_factor DEFAULT_RESIZE_FACTOR = theta_constants::DEFAULT_RESIZE_FACTOR;
|
100
107
|
|
101
108
|
/**
|
102
109
|
* Creates an instance of the builder with default parameters.
|
@@ -144,7 +151,6 @@ protected:
|
|
144
151
|
|
145
152
|
uint64_t starting_theta() const;
|
146
153
|
uint8_t starting_lg_size() const;
|
147
|
-
static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
|
148
154
|
};
|
149
155
|
|
150
156
|
// key extractor
|
@@ -24,22 +24,25 @@
|
|
24
24
|
#include <sstream>
|
25
25
|
#include <algorithm>
|
26
26
|
|
27
|
+
#include "theta_helpers.hpp"
|
28
|
+
|
27
29
|
namespace datasketches {
|
28
30
|
|
29
31
|
template<typename EN, typename EK, typename A>
|
30
|
-
theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator, bool is_empty):
|
32
|
+
theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const A& allocator, bool is_empty):
|
31
33
|
allocator_(allocator),
|
32
34
|
is_empty_(is_empty),
|
33
35
|
lg_cur_size_(lg_cur_size),
|
34
36
|
lg_nom_size_(lg_nom_size),
|
35
37
|
rf_(rf),
|
38
|
+
p_(p),
|
36
39
|
num_entries_(0),
|
37
40
|
theta_(theta),
|
38
41
|
seed_(seed),
|
39
42
|
entries_(nullptr)
|
40
43
|
{
|
41
44
|
if (lg_cur_size > 0) {
|
42
|
-
const size_t size =
|
45
|
+
const size_t size = 1ULL << lg_cur_size;
|
43
46
|
entries_ = allocator_.allocate(size);
|
44
47
|
for (size_t i = 0; i < size; ++i) EK()(entries_[i]) = 0;
|
45
48
|
}
|
@@ -52,13 +55,14 @@ is_empty_(other.is_empty_),
|
|
52
55
|
lg_cur_size_(other.lg_cur_size_),
|
53
56
|
lg_nom_size_(other.lg_nom_size_),
|
54
57
|
rf_(other.rf_),
|
58
|
+
p_(other.p_),
|
55
59
|
num_entries_(other.num_entries_),
|
56
60
|
theta_(other.theta_),
|
57
61
|
seed_(other.seed_),
|
58
62
|
entries_(nullptr)
|
59
63
|
{
|
60
64
|
if (other.entries_ != nullptr) {
|
61
|
-
const size_t size =
|
65
|
+
const size_t size = 1ULL << lg_cur_size_;
|
62
66
|
entries_ = allocator_.allocate(size);
|
63
67
|
for (size_t i = 0; i < size; ++i) {
|
64
68
|
if (EK()(other.entries_[i]) != 0) {
|
@@ -77,6 +81,7 @@ is_empty_(other.is_empty_),
|
|
77
81
|
lg_cur_size_(other.lg_cur_size_),
|
78
82
|
lg_nom_size_(other.lg_nom_size_),
|
79
83
|
rf_(other.rf_),
|
84
|
+
p_(other.p_),
|
80
85
|
num_entries_(other.num_entries_),
|
81
86
|
theta_(other.theta_),
|
82
87
|
seed_(other.seed_),
|
@@ -89,7 +94,7 @@ template<typename EN, typename EK, typename A>
|
|
89
94
|
theta_update_sketch_base<EN, EK, A>::~theta_update_sketch_base()
|
90
95
|
{
|
91
96
|
if (entries_ != nullptr) {
|
92
|
-
const size_t size =
|
97
|
+
const size_t size = 1ULL << lg_cur_size_;
|
93
98
|
for (size_t i = 0; i < size; ++i) {
|
94
99
|
if (EK()(entries_[i]) != 0) entries_[i].~EN();
|
95
100
|
}
|
@@ -105,6 +110,7 @@ theta_update_sketch_base<EN, EK, A>& theta_update_sketch_base<EN, EK, A>::operat
|
|
105
110
|
std::swap(lg_cur_size_, copy.lg_cur_size_);
|
106
111
|
std::swap(lg_nom_size_, copy.lg_nom_size_);
|
107
112
|
std::swap(rf_, copy.rf_);
|
113
|
+
std::swap(p_, copy.p_);
|
108
114
|
std::swap(num_entries_, copy.num_entries_);
|
109
115
|
std::swap(theta_, copy.theta_);
|
110
116
|
std::swap(seed_, copy.seed_);
|
@@ -119,6 +125,7 @@ theta_update_sketch_base<EN, EK, A>& theta_update_sketch_base<EN, EK, A>::operat
|
|
119
125
|
std::swap(lg_cur_size_, other.lg_cur_size_);
|
120
126
|
std::swap(lg_nom_size_, other.lg_nom_size_);
|
121
127
|
std::swap(rf_, other.rf_);
|
128
|
+
std::swap(p_, other.p_);
|
122
129
|
std::swap(num_entries_, other.num_entries_);
|
123
130
|
std::swap(theta_, other.theta_);
|
124
131
|
std::swap(seed_, other.seed_);
|
@@ -136,18 +143,23 @@ uint64_t theta_update_sketch_base<EN, EK, A>::hash_and_screen(const void* data,
|
|
136
143
|
|
137
144
|
template<typename EN, typename EK, typename A>
|
138
145
|
auto theta_update_sketch_base<EN, EK, A>::find(uint64_t key) const -> std::pair<iterator, bool> {
|
139
|
-
|
140
|
-
|
141
|
-
|
146
|
+
return find(entries_, lg_cur_size_, key);
|
147
|
+
}
|
148
|
+
|
149
|
+
template<typename EN, typename EK, typename A>
|
150
|
+
auto theta_update_sketch_base<EN, EK, A>::find(EN* entries, uint8_t lg_size, uint64_t key) -> std::pair<iterator, bool> {
|
151
|
+
const uint32_t size = 1 << lg_size;
|
152
|
+
const uint32_t mask = size - 1;
|
153
|
+
const uint32_t stride = get_stride(key, lg_size);
|
142
154
|
uint32_t index = static_cast<uint32_t>(key) & mask;
|
143
155
|
// search for duplicate or zero
|
144
156
|
const uint32_t loop_index = index;
|
145
157
|
do {
|
146
|
-
const uint64_t probe = EK()(
|
158
|
+
const uint64_t probe = EK()(entries[index]);
|
147
159
|
if (probe == 0) {
|
148
|
-
return std::pair<iterator, bool>(&
|
160
|
+
return std::pair<iterator, bool>(&entries[index], false);
|
149
161
|
} else if (probe == key) {
|
150
|
-
return std::pair<iterator, bool>(&
|
162
|
+
return std::pair<iterator, bool>(&entries[index], true);
|
151
163
|
}
|
152
164
|
index = (index + stride) & mask;
|
153
165
|
} while (index != loop_index);
|
@@ -175,13 +187,13 @@ auto theta_update_sketch_base<EN, EK, A>::begin() const -> iterator {
|
|
175
187
|
|
176
188
|
template<typename EN, typename EK, typename A>
|
177
189
|
auto theta_update_sketch_base<EN, EK, A>::end() const -> iterator {
|
178
|
-
return &entries_[
|
190
|
+
return &entries_[1ULL << lg_cur_size_];
|
179
191
|
}
|
180
192
|
|
181
193
|
template<typename EN, typename EK, typename A>
|
182
194
|
uint32_t theta_update_sketch_base<EN, EK, A>::get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size) {
|
183
195
|
const double fraction = (lg_cur_size <= lg_nom_size) ? RESIZE_THRESHOLD : REBUILD_THRESHOLD;
|
184
|
-
return std::floor(fraction * (1 << lg_cur_size));
|
196
|
+
return static_cast<uint32_t>(std::floor(fraction * (1 << lg_cur_size)));
|
185
197
|
}
|
186
198
|
|
187
199
|
template<typename EN, typename EK, typename A>
|
@@ -192,29 +204,29 @@ uint32_t theta_update_sketch_base<EN, EK, A>::get_stride(uint64_t key, uint8_t l
|
|
192
204
|
|
193
205
|
template<typename EN, typename EK, typename A>
|
194
206
|
void theta_update_sketch_base<EN, EK, A>::resize() {
|
195
|
-
const size_t old_size =
|
196
|
-
const uint8_t
|
197
|
-
const
|
198
|
-
|
199
|
-
|
200
|
-
EN* old_entries = entries_;
|
201
|
-
entries_ = allocator_.allocate(new_size);
|
202
|
-
for (size_t i = 0; i < new_size; ++i) EK()(entries_[i]) = 0;
|
203
|
-
num_entries_ = 0;
|
207
|
+
const size_t old_size = 1ULL << lg_cur_size_;
|
208
|
+
const uint8_t lg_new_size = std::min<uint8_t>(lg_cur_size_ + static_cast<uint8_t>(rf_), lg_nom_size_ + 1);
|
209
|
+
const size_t new_size = 1ULL << lg_new_size;
|
210
|
+
EN* new_entries = allocator_.allocate(new_size);
|
211
|
+
for (size_t i = 0; i < new_size; ++i) EK()(new_entries[i]) = 0;
|
204
212
|
for (size_t i = 0; i < old_size; ++i) {
|
205
|
-
const uint64_t key = EK()(
|
213
|
+
const uint64_t key = EK()(entries_[i]);
|
206
214
|
if (key != 0) {
|
207
|
-
|
208
|
-
|
215
|
+
// always finds an empty slot in a larger table
|
216
|
+
new (find(new_entries, lg_new_size, key).first) EN(std::move(entries_[i]));
|
217
|
+
entries_[i].~EN();
|
218
|
+
EK()(entries_[i]) = 0;
|
209
219
|
}
|
210
220
|
}
|
211
|
-
|
221
|
+
std::swap(entries_, new_entries);
|
222
|
+
lg_cur_size_ = lg_new_size;
|
223
|
+
allocator_.deallocate(new_entries, old_size);
|
212
224
|
}
|
213
225
|
|
214
226
|
// assumes number of entries > nominal size
|
215
227
|
template<typename EN, typename EK, typename A>
|
216
228
|
void theta_update_sketch_base<EN, EK, A>::rebuild() {
|
217
|
-
const size_t size =
|
229
|
+
const size_t size = 1ULL << lg_cur_size_;
|
218
230
|
const uint32_t nominal_size = 1 << lg_nom_size_;
|
219
231
|
|
220
232
|
// empty entries have uninitialized payloads
|
@@ -227,10 +239,10 @@ void theta_update_sketch_base<EN, EK, A>::rebuild() {
|
|
227
239
|
const size_t num_old_entries = num_entries_;
|
228
240
|
entries_ = allocator_.allocate(size);
|
229
241
|
for (size_t i = 0; i < size; ++i) EK()(entries_[i]) = 0;
|
230
|
-
num_entries_ =
|
242
|
+
num_entries_ = nominal_size;
|
231
243
|
// relies on consolidating non-empty entries to the front
|
232
244
|
for (size_t i = 0; i < nominal_size; ++i) {
|
233
|
-
|
245
|
+
new (find(EK()(old_entries[i])).first) EN(std::move(old_entries[i]));
|
234
246
|
old_entries[i].~EN();
|
235
247
|
}
|
236
248
|
for (size_t i = nominal_size; i < num_old_entries; ++i) old_entries[i].~EN();
|
@@ -242,6 +254,29 @@ void theta_update_sketch_base<EN, EK, A>::trim() {
|
|
242
254
|
if (num_entries_ > static_cast<uint32_t>(1 << lg_nom_size_)) rebuild();
|
243
255
|
}
|
244
256
|
|
257
|
+
template<typename EN, typename EK, typename A>
|
258
|
+
void theta_update_sketch_base<EN, EK, A>::reset() {
|
259
|
+
const size_t cur_size = 1ULL << lg_cur_size_;
|
260
|
+
for (size_t i = 0; i < cur_size; ++i) {
|
261
|
+
if (EK()(entries_[i]) != 0) {
|
262
|
+
entries_[i].~EN();
|
263
|
+
EK()(entries_[i]) = 0;
|
264
|
+
}
|
265
|
+
}
|
266
|
+
const uint8_t starting_lg_size = theta_build_helper<true>::starting_sub_multiple(
|
267
|
+
lg_nom_size_ + 1, theta_constants::MIN_LG_K, static_cast<uint8_t>(rf_));
|
268
|
+
if (starting_lg_size != lg_cur_size_) {
|
269
|
+
allocator_.deallocate(entries_, cur_size);
|
270
|
+
lg_cur_size_ = starting_lg_size;
|
271
|
+
const size_t new_size = 1ULL << starting_lg_size;
|
272
|
+
entries_ = allocator_.allocate(new_size);
|
273
|
+
for (size_t i = 0; i < new_size; ++i) EK()(entries_[i]) = 0;
|
274
|
+
}
|
275
|
+
num_entries_ = 0;
|
276
|
+
theta_ = theta_build_helper<true>::starting_theta_from_p(p_);
|
277
|
+
is_empty_ = true;
|
278
|
+
}
|
279
|
+
|
245
280
|
template<typename EN, typename EK, typename A>
|
246
281
|
void theta_update_sketch_base<EN, EK, A>::consolidate_non_empty(EN* entries, size_t size, size_t num) {
|
247
282
|
// find the first empty slot
|
@@ -266,7 +301,11 @@ void theta_update_sketch_base<EN, EK, A>::consolidate_non_empty(EN* entries, siz
|
|
266
301
|
|
267
302
|
template<typename Derived, typename Allocator>
|
268
303
|
theta_base_builder<Derived, Allocator>::theta_base_builder(const Allocator& allocator):
|
269
|
-
allocator_(allocator),
|
304
|
+
allocator_(allocator),
|
305
|
+
lg_k_(theta_constants::DEFAULT_LG_K),
|
306
|
+
rf_(theta_constants::DEFAULT_RESIZE_FACTOR),
|
307
|
+
p_(1),
|
308
|
+
seed_(DEFAULT_SEED) {}
|
270
309
|
|
271
310
|
template<typename Derived, typename Allocator>
|
272
311
|
Derived& theta_base_builder<Derived, Allocator>::set_lg_k(uint8_t lg_k) {
|
@@ -301,18 +340,12 @@ Derived& theta_base_builder<Derived, Allocator>::set_seed(uint64_t seed) {
|
|
301
340
|
|
302
341
|
template<typename Derived, typename Allocator>
|
303
342
|
uint64_t theta_base_builder<Derived, Allocator>::starting_theta() const {
|
304
|
-
|
305
|
-
return theta_constants::MAX_THETA;
|
343
|
+
return theta_build_helper<true>::starting_theta_from_p(p_);
|
306
344
|
}
|
307
345
|
|
308
346
|
template<typename Derived, typename Allocator>
|
309
347
|
uint8_t theta_base_builder<Derived, Allocator>::starting_lg_size() const {
|
310
|
-
return starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_));
|
311
|
-
}
|
312
|
-
|
313
|
-
template<typename Derived, typename Allocator>
|
314
|
-
uint8_t theta_base_builder<Derived, Allocator>::starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
|
315
|
-
return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
|
348
|
+
return theta_build_helper<true>::starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_));
|
316
349
|
}
|
317
350
|
|
318
351
|
// iterator
|
@@ -37,7 +37,7 @@ TEST_CASE("theta a-not-b: empty", "[theta_a_not_b]") {
|
|
37
37
|
TEST_CASE("theta a-not-b: non empty no retained keys", "[theta_a_not_b]") {
|
38
38
|
update_theta_sketch a = update_theta_sketch::builder().build();
|
39
39
|
a.update(1);
|
40
|
-
update_theta_sketch b = update_theta_sketch::builder().set_p(0.
|
40
|
+
update_theta_sketch b = update_theta_sketch::builder().set_p(0.001f).build();
|
41
41
|
theta_a_not_b a_not_b;
|
42
42
|
|
43
43
|
// B is still empty
|
@@ -167,6 +167,28 @@ TEST_CASE("theta a-not-b: estimation mode half overlap", "[theta_a_not_b]") {
|
|
167
167
|
REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
|
168
168
|
}
|
169
169
|
|
170
|
+
TEST_CASE("theta a-not-b: estimation mode half overlap wrapped compact", "[theta_a_not_b]") {
|
171
|
+
update_theta_sketch a = update_theta_sketch::builder().build();
|
172
|
+
int value = 0;
|
173
|
+
for (int i = 0; i < 10000; i++) a.update(value++);
|
174
|
+
auto bytes_a = a.compact().serialize();
|
175
|
+
|
176
|
+
update_theta_sketch b = update_theta_sketch::builder().build();
|
177
|
+
value = 5000;
|
178
|
+
for (int i = 0; i < 10000; i++) b.update(value++);
|
179
|
+
auto bytes_b = b.compact().serialize();
|
180
|
+
|
181
|
+
theta_a_not_b a_not_b;
|
182
|
+
|
183
|
+
auto result = a_not_b.compute(
|
184
|
+
wrapped_compact_theta_sketch::wrap(bytes_a.data(), bytes_a.size()),
|
185
|
+
wrapped_compact_theta_sketch::wrap(bytes_b.data(), bytes_b.size())
|
186
|
+
);
|
187
|
+
REQUIRE_FALSE(result.is_empty());
|
188
|
+
REQUIRE(result.is_estimation_mode());
|
189
|
+
REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
|
190
|
+
}
|
191
|
+
|
170
192
|
TEST_CASE("theta a-not-b: estimation mode disjoint", "[theta_a_not_b]") {
|
171
193
|
update_theta_sketch a = update_theta_sketch::builder().build();
|
172
194
|
int value = 0;
|
Binary file
|
@@ -48,7 +48,7 @@ TEST_CASE("theta intersection: empty", "[theta_intersection]") {
|
|
48
48
|
}
|
49
49
|
|
50
50
|
TEST_CASE("theta intersection: non empty no retained keys", "[theta_intersection]") {
|
51
|
-
update_theta_sketch sketch = update_theta_sketch::builder().set_p(0.
|
51
|
+
update_theta_sketch sketch = update_theta_sketch::builder().set_p(0.001f).build();
|
52
52
|
sketch.update(1);
|
53
53
|
theta_intersection intersection;
|
54
54
|
intersection.update(sketch);
|
@@ -174,6 +174,26 @@ TEST_CASE("theta intersection: estimation mode half overlap ordered", "[theta_in
|
|
174
174
|
REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
|
175
175
|
}
|
176
176
|
|
177
|
+
TEST_CASE("theta intersection: estimation mode half overlap ordered wrapped compact", "[theta_intersection]") {
|
178
|
+
update_theta_sketch sketch1 = update_theta_sketch::builder().build();
|
179
|
+
int value = 0;
|
180
|
+
for (int i = 0; i < 10000; i++) sketch1.update(value++);
|
181
|
+
auto bytes1 = sketch1.compact().serialize();
|
182
|
+
|
183
|
+
update_theta_sketch sketch2 = update_theta_sketch::builder().build();
|
184
|
+
value = 5000;
|
185
|
+
for (int i = 0; i < 10000; i++) sketch2.update(value++);
|
186
|
+
auto bytes2 = sketch2.compact().serialize();
|
187
|
+
|
188
|
+
theta_intersection intersection;
|
189
|
+
intersection.update(wrapped_compact_theta_sketch::wrap(bytes1.data(), bytes1.size()));
|
190
|
+
intersection.update(wrapped_compact_theta_sketch::wrap(bytes2.data(), bytes2.size()));
|
191
|
+
compact_theta_sketch result = intersection.get_result();
|
192
|
+
REQUIRE_FALSE(result.is_empty());
|
193
|
+
REQUIRE(result.is_estimation_mode());
|
194
|
+
REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.02));
|
195
|
+
}
|
196
|
+
|
177
197
|
TEST_CASE("theta intersection: estimation mode disjoint unordered", "[theta_intersection]") {
|
178
198
|
update_theta_sketch sketch1 = update_theta_sketch::builder().build();
|
179
199
|
int value = 0;
|
@@ -100,6 +100,28 @@ TEST_CASE("theta jaccard: half overlap estimation mode", "[theta_sketch]") {
|
|
100
100
|
REQUIRE(jc[2] == Approx(0.33).margin(0.01));
|
101
101
|
}
|
102
102
|
|
103
|
+
TEST_CASE("theta jaccard: half overlap estimation mode custom seed", "[theta_sketch]") {
|
104
|
+
const uint64_t seed = 123;
|
105
|
+
auto sk_a = update_theta_sketch::builder().set_seed(seed).build();
|
106
|
+
auto sk_b = update_theta_sketch::builder().set_seed(seed).build();
|
107
|
+
for (int i = 0; i < 10000; ++i) {
|
108
|
+
sk_a.update(i);
|
109
|
+
sk_b.update(i + 5000);
|
110
|
+
}
|
111
|
+
|
112
|
+
// update sketches
|
113
|
+
auto jc = theta_jaccard_similarity::jaccard(sk_a, sk_b, seed);
|
114
|
+
REQUIRE(jc[0] == Approx(0.33).margin(0.01));
|
115
|
+
REQUIRE(jc[1] == Approx(0.33).margin(0.01));
|
116
|
+
REQUIRE(jc[2] == Approx(0.33).margin(0.01));
|
117
|
+
|
118
|
+
// compact sketches
|
119
|
+
jc = theta_jaccard_similarity::jaccard(sk_a.compact(), sk_b.compact(), seed);
|
120
|
+
REQUIRE(jc[0] == Approx(0.33).margin(0.01));
|
121
|
+
REQUIRE(jc[1] == Approx(0.33).margin(0.01));
|
122
|
+
REQUIRE(jc[2] == Approx(0.33).margin(0.01));
|
123
|
+
}
|
124
|
+
|
103
125
|
/**
|
104
126
|
* The distribution is quite tight, about +/- 0.7%, which is pretty good since the accuracy of the
|
105
127
|
* underlying sketch is about +/- 1.56%.
|
@@ -107,7 +129,7 @@ TEST_CASE("theta jaccard: half overlap estimation mode", "[theta_sketch]") {
|
|
107
129
|
TEST_CASE("theta jaccard: similarity test", "[theta_sketch]") {
|
108
130
|
const int8_t min_lg_k = 12;
|
109
131
|
const int u1 = 1 << 20;
|
110
|
-
const int u2 = u1 * 0.95;
|
132
|
+
const int u2 = static_cast<int>(u1 * 0.95);
|
111
133
|
const double threshold = 0.943;
|
112
134
|
|
113
135
|
auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).build();
|
@@ -120,6 +142,23 @@ TEST_CASE("theta jaccard: similarity test", "[theta_sketch]") {
|
|
120
142
|
REQUIRE(theta_jaccard_similarity::similarity_test(actual, actual, threshold));
|
121
143
|
}
|
122
144
|
|
145
|
+
TEST_CASE("theta jaccard: similarity test custom seed", "[theta_sketch]") {
|
146
|
+
const int8_t min_lg_k = 12;
|
147
|
+
const int u1 = 1 << 20;
|
148
|
+
const int u2 = static_cast<int>(u1 * 0.95);
|
149
|
+
const double threshold = 0.943;
|
150
|
+
const uint64_t seed = 1234;
|
151
|
+
|
152
|
+
auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
|
153
|
+
for (int i = 0; i < u1; ++i) expected.update(i);
|
154
|
+
|
155
|
+
auto actual = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
|
156
|
+
for (int i = 0; i < u2; ++i) actual.update(i);
|
157
|
+
|
158
|
+
REQUIRE(theta_jaccard_similarity::similarity_test(actual, expected, threshold, seed));
|
159
|
+
REQUIRE(theta_jaccard_similarity::similarity_test(actual, actual, threshold, seed));
|
160
|
+
}
|
161
|
+
|
123
162
|
/**
|
124
163
|
* The distribution is much looser here, about +/- 14%. This is due to the fact that intersections lose accuracy
|
125
164
|
* as the ratio of intersection to the union becomes a small number.
|
@@ -127,7 +166,7 @@ TEST_CASE("theta jaccard: similarity test", "[theta_sketch]") {
|
|
127
166
|
TEST_CASE("theta jaccard: dissimilarity test", "[theta_sketch]") {
|
128
167
|
const int8_t min_lg_k = 12;
|
129
168
|
const int u1 = 1 << 20;
|
130
|
-
const int u2 = u1 * 0.05;
|
169
|
+
const int u2 = static_cast<int>(u1 * 0.05);
|
131
170
|
const double threshold = 0.061;
|
132
171
|
|
133
172
|
auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).build();
|
@@ -140,4 +179,21 @@ TEST_CASE("theta jaccard: dissimilarity test", "[theta_sketch]") {
|
|
140
179
|
REQUIRE_FALSE(theta_jaccard_similarity::dissimilarity_test(actual, actual, threshold));
|
141
180
|
}
|
142
181
|
|
182
|
+
TEST_CASE("theta jaccard: dissimilarity test custom seed", "[theta_sketch]") {
|
183
|
+
const int8_t min_lg_k = 12;
|
184
|
+
const int u1 = 1 << 20;
|
185
|
+
const int u2 = static_cast<int>(u1 * 0.05);
|
186
|
+
const double threshold = 0.061;
|
187
|
+
const uint64_t seed = 1234;
|
188
|
+
|
189
|
+
auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
|
190
|
+
for (int i = 0; i < u1; ++i) expected.update(i);
|
191
|
+
|
192
|
+
auto actual = update_theta_sketch::builder().set_lg_k(min_lg_k).set_seed(seed).build();
|
193
|
+
for (int i = 0; i < u2; ++i) actual.update(i);
|
194
|
+
|
195
|
+
REQUIRE(theta_jaccard_similarity::dissimilarity_test(actual, expected, threshold, seed));
|
196
|
+
REQUIRE_FALSE(theta_jaccard_similarity::dissimilarity_test(actual, actual, threshold, seed));
|
197
|
+
}
|
198
|
+
|
143
199
|
} /* namespace datasketches */
|