datasketches 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/datasketches/cpc_wrapper.cpp +12 -13
- data/ext/datasketches/ext.cpp +1 -1
- data/ext/datasketches/ext.h +4 -0
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/fi_wrapper.cpp +6 -8
- data/ext/datasketches/hll_wrapper.cpp +13 -14
- data/ext/datasketches/kll_wrapper.cpp +28 -76
- data/ext/datasketches/theta_wrapper.cpp +27 -41
- data/ext/datasketches/vo_wrapper.cpp +4 -6
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/README.md +4 -4
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
- data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
- data/vendor/datasketches-cpp/python/README.md +6 -3
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
- data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
- data/vendor/datasketches-cpp/setup.py +5 -3
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
- data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
- data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
- metadata +43 -34
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
|
File without changes
|
|
File without changes
|
|
@@ -20,29 +20,28 @@
|
|
|
20
20
|
#ifndef THETA_INTERSECTION_HPP_
|
|
21
21
|
#define THETA_INTERSECTION_HPP_
|
|
22
22
|
|
|
23
|
-
#include <memory>
|
|
24
|
-
#include <functional>
|
|
25
|
-
#include <climits>
|
|
26
|
-
|
|
27
23
|
#include "theta_sketch.hpp"
|
|
28
|
-
#include "
|
|
24
|
+
#include "theta_intersection_base.hpp"
|
|
29
25
|
|
|
30
26
|
namespace datasketches {
|
|
31
27
|
|
|
32
|
-
|
|
33
|
-
* author Alexander Saydakov
|
|
34
|
-
* author Lee Rhodes
|
|
35
|
-
* author Kevin Lang
|
|
36
|
-
*/
|
|
37
|
-
|
|
38
|
-
template<typename A>
|
|
28
|
+
template<typename Allocator = std::allocator<uint64_t>>
|
|
39
29
|
class theta_intersection_alloc {
|
|
40
30
|
public:
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
31
|
+
using Entry = uint64_t;
|
|
32
|
+
using ExtractKey = trivial_extract_key;
|
|
33
|
+
using Sketch = theta_sketch_alloc<Allocator>;
|
|
34
|
+
using CompactSketch = compact_theta_sketch_alloc<Allocator>;
|
|
35
|
+
|
|
36
|
+
struct pass_through_policy {
|
|
37
|
+
uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
|
|
38
|
+
unused(incoming_entry);
|
|
39
|
+
return internal_entry;
|
|
40
|
+
}
|
|
41
|
+
};
|
|
42
|
+
using State = theta_intersection_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
|
|
43
|
+
|
|
44
|
+
explicit theta_intersection_alloc(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
|
|
46
45
|
|
|
47
46
|
/**
|
|
48
47
|
* Updates the intersection with a given sketch.
|
|
@@ -50,7 +49,8 @@ public:
|
|
|
50
49
|
* can reduce the current set to leave the overlapping subset only.
|
|
51
50
|
* @param sketch represents input set for the intersection
|
|
52
51
|
*/
|
|
53
|
-
|
|
52
|
+
template<typename FwdSketch>
|
|
53
|
+
void update(FwdSketch&& sketch);
|
|
54
54
|
|
|
55
55
|
/**
|
|
56
56
|
* Produces a copy of the current state of the intersection.
|
|
@@ -59,7 +59,7 @@ public:
|
|
|
59
59
|
* @param ordered optional flag to specify if ordered sketch should be produced
|
|
60
60
|
* @return the result of the intersection
|
|
61
61
|
*/
|
|
62
|
-
|
|
62
|
+
CompactSketch get_result(bool ordered = true) const;
|
|
63
63
|
|
|
64
64
|
/**
|
|
65
65
|
* Returns true if the state of the intersection is defined (not infinite "universe").
|
|
@@ -68,21 +68,14 @@ public:
|
|
|
68
68
|
bool has_result() const;
|
|
69
69
|
|
|
70
70
|
private:
|
|
71
|
-
|
|
72
|
-
bool is_valid_;
|
|
73
|
-
bool is_empty_;
|
|
74
|
-
uint64_t theta_;
|
|
75
|
-
uint8_t lg_size_;
|
|
76
|
-
vector_u64<A> keys_;
|
|
77
|
-
uint32_t num_keys_;
|
|
78
|
-
uint16_t seed_hash_;
|
|
71
|
+
State state_;
|
|
79
72
|
};
|
|
80
73
|
|
|
81
74
|
// alias with default allocator for convenience
|
|
82
|
-
|
|
75
|
+
using theta_intersection = theta_intersection_alloc<std::allocator<uint64_t>>;
|
|
83
76
|
|
|
84
77
|
} /* namespace datasketches */
|
|
85
78
|
|
|
86
79
|
#include "theta_intersection_impl.hpp"
|
|
87
80
|
|
|
88
|
-
#
|
|
81
|
+
#endif
|
|
File without changes
|
|
File without changes
|
|
@@ -20,109 +20,27 @@
|
|
|
20
20
|
#ifndef THETA_INTERSECTION_IMPL_HPP_
|
|
21
21
|
#define THETA_INTERSECTION_IMPL_HPP_
|
|
22
22
|
|
|
23
|
-
#include <algorithm>
|
|
24
|
-
|
|
25
23
|
namespace datasketches {
|
|
26
24
|
|
|
27
|
-
/*
|
|
28
|
-
* author Alexander Saydakov
|
|
29
|
-
* author Lee Rhodes
|
|
30
|
-
* author Kevin Lang
|
|
31
|
-
*/
|
|
32
|
-
|
|
33
25
|
template<typename A>
|
|
34
|
-
theta_intersection_alloc<A>::theta_intersection_alloc(uint64_t seed):
|
|
35
|
-
|
|
36
|
-
is_empty_(false),
|
|
37
|
-
theta_(theta_sketch_alloc<A>::MAX_THETA),
|
|
38
|
-
lg_size_(0),
|
|
39
|
-
keys_(),
|
|
40
|
-
num_keys_(0),
|
|
41
|
-
seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
|
|
26
|
+
theta_intersection_alloc<A>::theta_intersection_alloc(uint64_t seed, const A& allocator):
|
|
27
|
+
state_(seed, pass_through_policy(), allocator)
|
|
42
28
|
{}
|
|
43
29
|
|
|
44
30
|
template<typename A>
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
is_empty_ |= sketch.is_empty();
|
|
49
|
-
theta_ = std::min(theta_, sketch.get_theta64());
|
|
50
|
-
if (is_valid_ && num_keys_ == 0) return;
|
|
51
|
-
if (sketch.get_num_retained() == 0) {
|
|
52
|
-
is_valid_ = true;
|
|
53
|
-
if (keys_.size() > 0) {
|
|
54
|
-
keys_.resize(0);
|
|
55
|
-
lg_size_ = 0;
|
|
56
|
-
num_keys_ = 0;
|
|
57
|
-
}
|
|
58
|
-
return;
|
|
59
|
-
}
|
|
60
|
-
if (!is_valid_) { // first update, clone incoming sketch
|
|
61
|
-
is_valid_ = true;
|
|
62
|
-
lg_size_ = lg_size_from_count(sketch.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
|
|
63
|
-
keys_.resize(1 << lg_size_, 0);
|
|
64
|
-
for (auto key: sketch) {
|
|
65
|
-
if (!update_theta_sketch_alloc<A>::hash_search_or_insert(key, keys_.data(), lg_size_)) {
|
|
66
|
-
throw std::invalid_argument("duplicate key, possibly corrupted input sketch");
|
|
67
|
-
}
|
|
68
|
-
++num_keys_;
|
|
69
|
-
}
|
|
70
|
-
if (num_keys_ != sketch.get_num_retained()) throw std::invalid_argument("num keys mismatch, possibly corrupted input sketch");
|
|
71
|
-
} else { // intersection
|
|
72
|
-
const uint32_t max_matches = std::min(num_keys_, sketch.get_num_retained());
|
|
73
|
-
vector_u64<A> matched_keys(max_matches);
|
|
74
|
-
uint32_t match_count = 0;
|
|
75
|
-
uint32_t count = 0;
|
|
76
|
-
for (auto key: sketch) {
|
|
77
|
-
if (key < theta_) {
|
|
78
|
-
if (update_theta_sketch_alloc<A>::hash_search(key, keys_.data(), lg_size_)) {
|
|
79
|
-
if (match_count == max_matches) throw std::invalid_argument("max matches exceeded, possibly corrupted input sketch");
|
|
80
|
-
matched_keys[match_count++] = key;
|
|
81
|
-
}
|
|
82
|
-
} else if (sketch.is_ordered()) {
|
|
83
|
-
break; // early stop
|
|
84
|
-
}
|
|
85
|
-
++count;
|
|
86
|
-
}
|
|
87
|
-
if (count > sketch.get_num_retained()) {
|
|
88
|
-
throw std::invalid_argument(" more keys then expected, possibly corrupted input sketch");
|
|
89
|
-
} else if (!sketch.is_ordered() && count < sketch.get_num_retained()) {
|
|
90
|
-
throw std::invalid_argument(" fewer keys then expected, possibly corrupted input sketch");
|
|
91
|
-
}
|
|
92
|
-
if (match_count == 0) {
|
|
93
|
-
keys_.resize(0);
|
|
94
|
-
lg_size_ = 0;
|
|
95
|
-
num_keys_ = 0;
|
|
96
|
-
if (theta_ == theta_sketch_alloc<A>::MAX_THETA) is_empty_ = true;
|
|
97
|
-
} else {
|
|
98
|
-
const uint8_t lg_size = lg_size_from_count(match_count, update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
|
|
99
|
-
if (lg_size != lg_size_) {
|
|
100
|
-
lg_size_ = lg_size;
|
|
101
|
-
keys_.resize(1 << lg_size_);
|
|
102
|
-
}
|
|
103
|
-
std::fill(keys_.begin(), keys_.end(), 0);
|
|
104
|
-
for (uint32_t i = 0; i < match_count; i++) {
|
|
105
|
-
update_theta_sketch_alloc<A>::hash_search_or_insert(matched_keys[i], keys_.data(), lg_size_);
|
|
106
|
-
}
|
|
107
|
-
num_keys_ = match_count;
|
|
108
|
-
}
|
|
109
|
-
}
|
|
31
|
+
template<typename SS>
|
|
32
|
+
void theta_intersection_alloc<A>::update(SS&& sketch) {
|
|
33
|
+
state_.update(std::forward<SS>(sketch));
|
|
110
34
|
}
|
|
111
35
|
|
|
112
36
|
template<typename A>
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
vector_u64<A> keys(num_keys_);
|
|
116
|
-
if (num_keys_ > 0) {
|
|
117
|
-
std::copy_if(keys_.begin(), keys_.end(), keys.begin(), [](uint64_t key) { return key != 0; });
|
|
118
|
-
if (ordered) std::sort(keys.begin(), keys.end());
|
|
119
|
-
}
|
|
120
|
-
return compact_theta_sketch_alloc<A>(is_empty_, theta_, std::move(keys), seed_hash_, ordered);
|
|
37
|
+
auto theta_intersection_alloc<A>::get_result(bool ordered) const -> CompactSketch {
|
|
38
|
+
return state_.get_result(ordered);
|
|
121
39
|
}
|
|
122
40
|
|
|
123
41
|
template<typename A>
|
|
124
42
|
bool theta_intersection_alloc<A>::has_result() const {
|
|
125
|
-
return
|
|
43
|
+
return state_.has_result();
|
|
126
44
|
}
|
|
127
45
|
|
|
128
46
|
} /* namespace datasketches */
|
|
@@ -17,28 +17,21 @@
|
|
|
17
17
|
* under the License.
|
|
18
18
|
*/
|
|
19
19
|
|
|
20
|
-
#
|
|
20
|
+
#ifndef THETA_JACCARD_SIMILARITY_HPP_
|
|
21
|
+
#define THETA_JACCARD_SIMILARITY_HPP_
|
|
21
22
|
|
|
22
|
-
#include
|
|
23
|
-
#include
|
|
24
|
-
|
|
25
|
-
#include <theta_union_experimental.hpp>
|
|
23
|
+
#include "theta_jaccard_similarity_base.hpp"
|
|
24
|
+
#include "theta_union.hpp"
|
|
25
|
+
#include "theta_intersection.hpp"
|
|
26
26
|
|
|
27
27
|
namespace datasketches {
|
|
28
28
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
update_sketch1.update(1);
|
|
32
|
-
update_sketch1.update(2);
|
|
33
|
-
|
|
34
|
-
auto update_sketch2 = update_theta_sketch_experimental<>::builder().build();
|
|
35
|
-
update_sketch2.update(1);
|
|
36
|
-
update_sketch2.update(3);
|
|
29
|
+
template<typename Allocator = std::allocator<uint64_t>>
|
|
30
|
+
using theta_jaccard_similarity_alloc = jaccard_similarity_base<theta_union_alloc<Allocator>, theta_intersection_alloc<Allocator>, trivial_extract_key>;
|
|
37
31
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
u.update(update_sketch2);
|
|
41
|
-
auto r = u.get_result();
|
|
42
|
-
}
|
|
32
|
+
// alias with default allocator for convenience
|
|
33
|
+
using theta_jaccard_similarity = theta_jaccard_similarity_alloc<std::allocator<uint64_t>>;
|
|
43
34
|
|
|
44
35
|
} /* namespace datasketches */
|
|
36
|
+
|
|
37
|
+
# endif
|
|
@@ -17,19 +17,16 @@
|
|
|
17
17
|
* under the License.
|
|
18
18
|
*/
|
|
19
19
|
|
|
20
|
-
#ifndef
|
|
21
|
-
#define
|
|
20
|
+
#ifndef THETA_JACCARD_SIMILARITY_BASE_HPP_
|
|
21
|
+
#define THETA_JACCARD_SIMILARITY_BASE_HPP_
|
|
22
22
|
|
|
23
23
|
#include <memory>
|
|
24
24
|
#include <array>
|
|
25
25
|
|
|
26
|
-
#include
|
|
27
|
-
#include
|
|
28
|
-
#include
|
|
29
|
-
#include
|
|
30
|
-
#include <bounds_on_ratios_in_theta_sketched_sets.hpp>
|
|
31
|
-
#include <ceiling_power_of_2.hpp>
|
|
32
|
-
#include <common_defs.hpp>
|
|
26
|
+
#include "theta_constants.hpp"
|
|
27
|
+
#include "bounds_on_ratios_in_theta_sketched_sets.hpp"
|
|
28
|
+
#include "ceiling_power_of_2.hpp"
|
|
29
|
+
#include "common_defs.hpp"
|
|
33
30
|
|
|
34
31
|
namespace datasketches {
|
|
35
32
|
|
|
@@ -154,19 +151,6 @@ private:
|
|
|
154
151
|
|
|
155
152
|
};
|
|
156
153
|
|
|
157
|
-
template<typename Allocator>
|
|
158
|
-
using theta_jaccard_similarity_alloc = jaccard_similarity_base<theta_union_experimental<Allocator>, theta_intersection_experimental<Allocator>, trivial_extract_key>;
|
|
159
|
-
|
|
160
|
-
// alias with default allocator for convenience
|
|
161
|
-
using theta_jaccard_similarity = theta_jaccard_similarity_alloc<std::allocator<uint64_t>>;
|
|
162
|
-
|
|
163
|
-
template<
|
|
164
|
-
typename Summary,
|
|
165
|
-
typename IntersectionPolicy,
|
|
166
|
-
typename UnionPolicy = default_union_policy<Summary>,
|
|
167
|
-
typename Allocator = std::allocator<Summary>>
|
|
168
|
-
using tuple_jaccard_similarity = jaccard_similarity_base<tuple_union<Summary, UnionPolicy, Allocator>, tuple_intersection<Summary, IntersectionPolicy, Allocator>, pair_extract_key<uint64_t, Summary>>;
|
|
169
|
-
|
|
170
154
|
} /* namespace datasketches */
|
|
171
155
|
|
|
172
156
|
# endif
|
|
File without changes
|
|
@@ -17,6 +17,9 @@
|
|
|
17
17
|
* under the License.
|
|
18
18
|
*/
|
|
19
19
|
|
|
20
|
+
#ifndef THETA_A_SET_DIFFERENCE_BASE_IMPL_HPP_
|
|
21
|
+
#define THETA_A_SET_DIFFERENCE_BASE_IMPL_HPP_
|
|
22
|
+
|
|
20
23
|
#include <algorithm>
|
|
21
24
|
|
|
22
25
|
#include "conditional_back_inserter.hpp"
|
|
@@ -78,3 +81,5 @@ CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch
|
|
|
78
81
|
}
|
|
79
82
|
|
|
80
83
|
} /* namespace datasketches */
|
|
84
|
+
|
|
85
|
+
#endif
|
|
@@ -20,45 +20,29 @@
|
|
|
20
20
|
#ifndef THETA_SKETCH_HPP_
|
|
21
21
|
#define THETA_SKETCH_HPP_
|
|
22
22
|
|
|
23
|
-
#include
|
|
24
|
-
#include <functional>
|
|
25
|
-
#include <climits>
|
|
26
|
-
#include <vector>
|
|
27
|
-
|
|
28
|
-
#include "common_defs.hpp"
|
|
23
|
+
#include "theta_update_sketch_base.hpp"
|
|
29
24
|
|
|
30
25
|
namespace datasketches {
|
|
31
26
|
|
|
32
|
-
|
|
33
|
-
* author Alexander Saydakov
|
|
34
|
-
* author Lee Rhodes
|
|
35
|
-
* author Kevin Lang
|
|
36
|
-
*/
|
|
37
|
-
|
|
38
|
-
// forward-declarations
|
|
39
|
-
template<typename A> class theta_sketch_alloc;
|
|
40
|
-
template<typename A> class update_theta_sketch_alloc;
|
|
41
|
-
template<typename A> class compact_theta_sketch_alloc;
|
|
42
|
-
template<typename A> class theta_union_alloc;
|
|
43
|
-
template<typename A> class theta_intersection_alloc;
|
|
44
|
-
template<typename A> class theta_a_not_b_alloc;
|
|
45
|
-
|
|
46
|
-
// for serialization as raw bytes
|
|
47
|
-
template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
|
|
48
|
-
template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
|
|
49
|
-
|
|
50
|
-
template<typename A>
|
|
27
|
+
template<typename Allocator = std::allocator<uint64_t>>
|
|
51
28
|
class theta_sketch_alloc {
|
|
52
29
|
public:
|
|
53
|
-
|
|
54
|
-
|
|
30
|
+
using Entry = uint64_t;
|
|
31
|
+
using ExtractKey = trivial_extract_key;
|
|
32
|
+
using iterator = theta_iterator<Entry, ExtractKey>;
|
|
33
|
+
using const_iterator = theta_const_iterator<Entry, ExtractKey>;
|
|
55
34
|
|
|
56
35
|
virtual ~theta_sketch_alloc() = default;
|
|
57
36
|
|
|
37
|
+
/**
|
|
38
|
+
* @return allocator
|
|
39
|
+
*/
|
|
40
|
+
virtual Allocator get_allocator() const = 0;
|
|
41
|
+
|
|
58
42
|
/**
|
|
59
43
|
* @return true if this sketch represents an empty set (not the same as no retained entries!)
|
|
60
44
|
*/
|
|
61
|
-
bool is_empty() const;
|
|
45
|
+
virtual bool is_empty() const = 0;
|
|
62
46
|
|
|
63
47
|
/**
|
|
64
48
|
* @return estimate of the distinct count of the input stream
|
|
@@ -96,13 +80,16 @@ public:
|
|
|
96
80
|
/**
|
|
97
81
|
* @return theta as a positive integer between 0 and LLONG_MAX
|
|
98
82
|
*/
|
|
99
|
-
uint64_t get_theta64() const;
|
|
83
|
+
virtual uint64_t get_theta64() const = 0;
|
|
100
84
|
|
|
101
85
|
/**
|
|
102
86
|
* @return the number of retained entries in the sketch
|
|
103
87
|
*/
|
|
104
88
|
virtual uint32_t get_num_retained() const = 0;
|
|
105
89
|
|
|
90
|
+
/**
|
|
91
|
+
* @return hash of the seed that was used to hash the input
|
|
92
|
+
*/
|
|
106
93
|
virtual uint16_t get_seed_hash() const = 0;
|
|
107
94
|
|
|
108
95
|
/**
|
|
@@ -111,109 +98,82 @@ public:
|
|
|
111
98
|
virtual bool is_ordered() const = 0;
|
|
112
99
|
|
|
113
100
|
/**
|
|
114
|
-
*
|
|
101
|
+
* Provides a human-readable summary of this sketch as a string
|
|
115
102
|
* @param print_items if true include the list of items retained by the sketch
|
|
103
|
+
* @return sketch summary as a string
|
|
116
104
|
*/
|
|
117
|
-
virtual string<
|
|
118
|
-
|
|
119
|
-
/**
|
|
120
|
-
* This method serializes the sketch into a given stream in a binary form
|
|
121
|
-
* @param os output stream
|
|
122
|
-
*/
|
|
123
|
-
virtual void serialize(std::ostream& os) const = 0;
|
|
124
|
-
|
|
125
|
-
// This is a convenience alias for users
|
|
126
|
-
// The type returned by the following serialize method
|
|
127
|
-
typedef vector_u8<A> vector_bytes;
|
|
105
|
+
virtual string<Allocator> to_string(bool print_items = false) const;
|
|
128
106
|
|
|
129
107
|
/**
|
|
130
|
-
*
|
|
131
|
-
*
|
|
132
|
-
* It is an uninitialized space of a given size.
|
|
133
|
-
* This header is used in Datasketches PostgreSQL extension.
|
|
134
|
-
* @param header_size_bytes space to reserve in front of the sketch
|
|
135
|
-
*/
|
|
136
|
-
virtual vector_bytes serialize(unsigned header_size_bytes = 0) const = 0;
|
|
137
|
-
|
|
138
|
-
// This is a convenience alias for users
|
|
139
|
-
// The type returned by the following deserialize methods
|
|
140
|
-
// It is not possible to return instances of an abstract type, so this has to be a pointer
|
|
141
|
-
typedef std::unique_ptr<theta_sketch_alloc<A>, std::function<void(theta_sketch_alloc<A>*)>> unique_ptr;
|
|
142
|
-
|
|
143
|
-
/**
|
|
144
|
-
* This method deserializes a sketch from a given stream.
|
|
145
|
-
* @param is input stream
|
|
146
|
-
* @param seed the seed for the hash function that was used to create the sketch
|
|
147
|
-
* @return an instance of a sketch as a unique_ptr
|
|
108
|
+
* Iterator over hash values in this sketch.
|
|
109
|
+
* @return begin iterator
|
|
148
110
|
*/
|
|
149
|
-
|
|
111
|
+
virtual iterator begin() = 0;
|
|
150
112
|
|
|
151
113
|
/**
|
|
152
|
-
*
|
|
153
|
-
*
|
|
154
|
-
* @
|
|
155
|
-
* @param seed the seed for the hash function that was used to create the sketch
|
|
156
|
-
* @return an instance of the sketch
|
|
114
|
+
* Iterator pointing past the valid range.
|
|
115
|
+
* Not to be incremented or dereferenced.
|
|
116
|
+
* @return end iterator
|
|
157
117
|
*/
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
class const_iterator;
|
|
118
|
+
virtual iterator end() = 0;
|
|
161
119
|
|
|
162
120
|
/**
|
|
163
|
-
*
|
|
121
|
+
* Const iterator over hash values in this sketch.
|
|
164
122
|
* @return begin iterator
|
|
165
123
|
*/
|
|
166
124
|
virtual const_iterator begin() const = 0;
|
|
167
125
|
|
|
168
126
|
/**
|
|
169
|
-
*
|
|
127
|
+
* Const iterator pointing past the valid range.
|
|
170
128
|
* Not to be incremented or dereferenced.
|
|
171
129
|
* @return end iterator
|
|
172
130
|
*/
|
|
173
131
|
virtual const_iterator end() const = 0;
|
|
174
132
|
|
|
175
133
|
protected:
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
bool is_empty_;
|
|
179
|
-
uint64_t theta_;
|
|
180
|
-
|
|
181
|
-
theta_sketch_alloc(bool is_empty, uint64_t theta);
|
|
182
|
-
|
|
183
|
-
static uint16_t get_seed_hash(uint64_t seed);
|
|
184
|
-
|
|
185
|
-
static void check_sketch_type(uint8_t actual, uint8_t expected);
|
|
186
|
-
static void check_serial_version(uint8_t actual, uint8_t expected);
|
|
187
|
-
static void check_seed_hash(uint16_t actual, uint16_t expected);
|
|
188
|
-
|
|
189
|
-
friend theta_intersection_alloc<A>;
|
|
190
|
-
friend theta_a_not_b_alloc<A>;
|
|
134
|
+
using ostrstream = std::basic_ostringstream<char, std::char_traits<char>, AllocChar<Allocator>>;
|
|
135
|
+
virtual void print_specifics(ostrstream& os) const = 0;
|
|
191
136
|
};
|
|
192
137
|
|
|
193
|
-
//
|
|
194
|
-
|
|
195
|
-
template<typename A> using AllocU64 = typename std::allocator_traits<A>::template rebind_alloc<uint64_t>;
|
|
196
|
-
template<typename A> using vector_u64 = std::vector<uint64_t, AllocU64<A>>;
|
|
138
|
+
// forward declaration
|
|
139
|
+
template<typename A> class compact_theta_sketch_alloc;
|
|
197
140
|
|
|
198
|
-
template<typename
|
|
199
|
-
class update_theta_sketch_alloc: public theta_sketch_alloc<
|
|
141
|
+
template<typename Allocator = std::allocator<uint64_t>>
|
|
142
|
+
class update_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
|
|
200
143
|
public:
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
144
|
+
using Base = theta_sketch_alloc<Allocator>;
|
|
145
|
+
using Entry = typename Base::Entry;
|
|
146
|
+
using ExtractKey = typename Base::ExtractKey;
|
|
147
|
+
using iterator = typename Base::iterator;
|
|
148
|
+
using const_iterator = typename Base::const_iterator;
|
|
149
|
+
using theta_table = theta_update_sketch_base<Entry, ExtractKey, Allocator>;
|
|
150
|
+
using resize_factor = typename theta_table::resize_factor;
|
|
204
151
|
|
|
205
152
|
// No constructor here. Use builder instead.
|
|
153
|
+
class builder;
|
|
206
154
|
|
|
155
|
+
update_theta_sketch_alloc(const update_theta_sketch_alloc&) = default;
|
|
156
|
+
update_theta_sketch_alloc(update_theta_sketch_alloc&&) noexcept = default;
|
|
207
157
|
virtual ~update_theta_sketch_alloc() = default;
|
|
158
|
+
update_theta_sketch_alloc& operator=(const update_theta_sketch_alloc&) = default;
|
|
159
|
+
update_theta_sketch_alloc& operator=(update_theta_sketch_alloc&&) = default;
|
|
208
160
|
|
|
209
|
-
virtual
|
|
210
|
-
virtual
|
|
161
|
+
virtual Allocator get_allocator() const;
|
|
162
|
+
virtual bool is_empty() const;
|
|
211
163
|
virtual bool is_ordered() const;
|
|
212
|
-
virtual
|
|
213
|
-
virtual
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
164
|
+
virtual uint16_t get_seed_hash() const;
|
|
165
|
+
virtual uint64_t get_theta64() const;
|
|
166
|
+
virtual uint32_t get_num_retained() const;
|
|
167
|
+
|
|
168
|
+
/**
|
|
169
|
+
* @return configured nominal number of entries in the sketch
|
|
170
|
+
*/
|
|
171
|
+
uint8_t get_lg_k() const;
|
|
172
|
+
|
|
173
|
+
/**
|
|
174
|
+
* @return configured resize factor of the sketch
|
|
175
|
+
*/
|
|
176
|
+
resize_factor get_rf() const;
|
|
217
177
|
|
|
218
178
|
/**
|
|
219
179
|
* Update this sketch with a given string.
|
|
@@ -302,7 +262,7 @@ public:
|
|
|
302
262
|
* @param data pointer to the data
|
|
303
263
|
* @param length of the data in bytes
|
|
304
264
|
*/
|
|
305
|
-
void update(const void* data,
|
|
265
|
+
void update(const void* data, size_t length);
|
|
306
266
|
|
|
307
267
|
/**
|
|
308
268
|
* Remove retained entries in excess of the nominal size k (if any)
|
|
@@ -314,105 +274,85 @@ public:
|
|
|
314
274
|
* @param ordered optional flag to specify if ordered sketch should be produced
|
|
315
275
|
* @return compact sketch
|
|
316
276
|
*/
|
|
317
|
-
compact_theta_sketch_alloc<
|
|
318
|
-
|
|
319
|
-
virtual typename theta_sketch_alloc<A>::const_iterator begin() const;
|
|
320
|
-
virtual typename theta_sketch_alloc<A>::const_iterator end() const;
|
|
321
|
-
|
|
322
|
-
/**
|
|
323
|
-
* This method deserializes a sketch from a given stream.
|
|
324
|
-
* @param is input stream
|
|
325
|
-
* @param seed the seed for the hash function that was used to create the sketch
|
|
326
|
-
* @return an instance of a sketch
|
|
327
|
-
*/
|
|
328
|
-
static update_theta_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
|
|
277
|
+
compact_theta_sketch_alloc<Allocator> compact(bool ordered = true) const;
|
|
329
278
|
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
* @param seed the seed for the hash function that was used to create the sketch
|
|
335
|
-
* @return an instance of the sketch
|
|
336
|
-
*/
|
|
337
|
-
static update_theta_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
|
|
279
|
+
virtual iterator begin();
|
|
280
|
+
virtual iterator end();
|
|
281
|
+
virtual const_iterator begin() const;
|
|
282
|
+
virtual const_iterator end() const;
|
|
338
283
|
|
|
339
284
|
private:
|
|
340
|
-
|
|
341
|
-
static constexpr double RESIZE_THRESHOLD = 0.5;
|
|
342
|
-
// hash table rebuild threshold = 15/16
|
|
343
|
-
static constexpr double REBUILD_THRESHOLD = 15.0 / 16.0;
|
|
344
|
-
|
|
345
|
-
static constexpr uint8_t STRIDE_HASH_BITS = 7;
|
|
346
|
-
static constexpr uint32_t STRIDE_MASK = (1 << STRIDE_HASH_BITS) - 1;
|
|
347
|
-
|
|
348
|
-
uint8_t lg_cur_size_;
|
|
349
|
-
uint8_t lg_nom_size_;
|
|
350
|
-
vector_u64<A> keys_;
|
|
351
|
-
uint32_t num_keys_;
|
|
352
|
-
resize_factor rf_;
|
|
353
|
-
float p_;
|
|
354
|
-
uint64_t seed_;
|
|
355
|
-
uint32_t capacity_;
|
|
285
|
+
theta_table table_;
|
|
356
286
|
|
|
357
287
|
// for builder
|
|
358
|
-
update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
|
|
359
|
-
|
|
360
|
-
// for deserialize
|
|
361
|
-
update_theta_sketch_alloc(bool is_empty, uint64_t theta, uint8_t lg_cur_size, uint8_t lg_nom_size, vector_u64<A>&& keys, uint32_t num_keys, resize_factor rf, float p, uint64_t seed);
|
|
288
|
+
update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta,
|
|
289
|
+
uint64_t seed, const Allocator& allocator);
|
|
362
290
|
|
|
363
|
-
|
|
364
|
-
void
|
|
365
|
-
|
|
366
|
-
friend theta_union_alloc<A>;
|
|
367
|
-
void internal_update(uint64_t hash);
|
|
368
|
-
|
|
369
|
-
friend theta_intersection_alloc<A>;
|
|
370
|
-
friend theta_a_not_b_alloc<A>;
|
|
371
|
-
static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
|
|
372
|
-
static inline uint32_t get_stride(uint64_t hash, uint8_t lg_size);
|
|
373
|
-
static bool hash_search_or_insert(uint64_t hash, uint64_t* table, uint8_t lg_size);
|
|
374
|
-
static bool hash_search(uint64_t hash, const uint64_t* table, uint8_t lg_size);
|
|
375
|
-
|
|
376
|
-
friend theta_sketch_alloc<A>;
|
|
377
|
-
static update_theta_sketch_alloc<A> internal_deserialize(std::istream& is, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed);
|
|
378
|
-
static update_theta_sketch_alloc<A> internal_deserialize(const void* bytes, size_t size, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed);
|
|
291
|
+
using ostrstream = typename Base::ostrstream;
|
|
292
|
+
virtual void print_specifics(ostrstream& os) const;
|
|
379
293
|
};
|
|
380
294
|
|
|
381
295
|
// compact sketch
|
|
382
296
|
|
|
383
|
-
template<typename
|
|
384
|
-
class compact_theta_sketch_alloc: public theta_sketch_alloc<
|
|
297
|
+
template<typename Allocator = std::allocator<uint64_t>>
|
|
298
|
+
class compact_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
|
|
385
299
|
public:
|
|
300
|
+
using Base = theta_sketch_alloc<Allocator>;
|
|
301
|
+
using iterator = typename Base::iterator;
|
|
302
|
+
using const_iterator = typename Base::const_iterator;
|
|
303
|
+
using AllocBytes = typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>;
|
|
304
|
+
using vector_bytes = std::vector<uint8_t, AllocBytes>;
|
|
305
|
+
|
|
306
|
+
static const uint8_t SERIAL_VERSION = 3;
|
|
386
307
|
static const uint8_t SKETCH_TYPE = 3;
|
|
387
308
|
|
|
388
|
-
// No constructor here.
|
|
389
309
|
// Instances of this type can be obtained:
|
|
390
|
-
// - by compacting an
|
|
310
|
+
// - by compacting an update_theta_sketch_alloc
|
|
391
311
|
// - as a result of a set operation
|
|
392
312
|
// - by deserializing a previously serialized compact sketch
|
|
393
313
|
|
|
394
|
-
compact_theta_sketch_alloc(const
|
|
314
|
+
compact_theta_sketch_alloc(const Base& other, bool ordered);
|
|
315
|
+
compact_theta_sketch_alloc(const compact_theta_sketch_alloc&) = default;
|
|
316
|
+
compact_theta_sketch_alloc(compact_theta_sketch_alloc&&) noexcept = default;
|
|
395
317
|
virtual ~compact_theta_sketch_alloc() = default;
|
|
318
|
+
compact_theta_sketch_alloc& operator=(const compact_theta_sketch_alloc&) = default;
|
|
319
|
+
compact_theta_sketch_alloc& operator=(compact_theta_sketch_alloc&&) = default;
|
|
396
320
|
|
|
321
|
+
virtual Allocator get_allocator() const;
|
|
322
|
+
virtual bool is_empty() const;
|
|
323
|
+
virtual bool is_ordered() const;
|
|
324
|
+
virtual uint64_t get_theta64() const;
|
|
397
325
|
virtual uint32_t get_num_retained() const;
|
|
398
326
|
virtual uint16_t get_seed_hash() const;
|
|
399
|
-
virtual bool is_ordered() const;
|
|
400
|
-
virtual string<A> to_string(bool print_items = false) const;
|
|
401
|
-
virtual void serialize(std::ostream& os) const;
|
|
402
|
-
typedef vector_u8<A> vector_bytes; // alias for users
|
|
403
|
-
// header space is reserved, but not initialized
|
|
404
|
-
virtual vector_bytes serialize(unsigned header_size_bytes = 0) const;
|
|
405
327
|
|
|
406
|
-
|
|
407
|
-
|
|
328
|
+
/**
|
|
329
|
+
* This method serializes the sketch into a given stream in a binary form
|
|
330
|
+
* @param os output stream
|
|
331
|
+
*/
|
|
332
|
+
void serialize(std::ostream& os) const;
|
|
333
|
+
|
|
334
|
+
/**
|
|
335
|
+
* This method serializes the sketch as a vector of bytes.
|
|
336
|
+
* An optional header can be reserved in front of the sketch.
|
|
337
|
+
* It is an uninitialized space of a given size.
|
|
338
|
+
* This header is used in Datasketches PostgreSQL extension.
|
|
339
|
+
* @param header_size_bytes space to reserve in front of the sketch
|
|
340
|
+
*/
|
|
341
|
+
vector_bytes serialize(unsigned header_size_bytes = 0) const;
|
|
342
|
+
|
|
343
|
+
virtual iterator begin();
|
|
344
|
+
virtual iterator end();
|
|
345
|
+
virtual const_iterator begin() const;
|
|
346
|
+
virtual const_iterator end() const;
|
|
408
347
|
|
|
409
348
|
/**
|
|
410
349
|
* This method deserializes a sketch from a given stream.
|
|
411
350
|
* @param is input stream
|
|
412
351
|
* @param seed the seed for the hash function that was used to create the sketch
|
|
413
|
-
* @return an instance of
|
|
352
|
+
* @return an instance of the sketch
|
|
414
353
|
*/
|
|
415
|
-
static compact_theta_sketch_alloc
|
|
354
|
+
static compact_theta_sketch_alloc deserialize(std::istream& is,
|
|
355
|
+
uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
|
|
416
356
|
|
|
417
357
|
/**
|
|
418
358
|
* This method deserializes a sketch from a given array of bytes.
|
|
@@ -421,110 +361,36 @@ public:
|
|
|
421
361
|
* @param seed the seed for the hash function that was used to create the sketch
|
|
422
362
|
* @return an instance of the sketch
|
|
423
363
|
*/
|
|
424
|
-
static compact_theta_sketch_alloc
|
|
364
|
+
static compact_theta_sketch_alloc deserialize(const void* bytes, size_t size,
|
|
365
|
+
uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
|
|
366
|
+
|
|
367
|
+
// for internal use
|
|
368
|
+
compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<uint64_t, Allocator>&& entries);
|
|
425
369
|
|
|
426
370
|
private:
|
|
427
|
-
|
|
371
|
+
enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
|
|
428
372
|
|
|
429
|
-
|
|
430
|
-
uint16_t seed_hash_;
|
|
373
|
+
bool is_empty_;
|
|
431
374
|
bool is_ordered_;
|
|
375
|
+
uint16_t seed_hash_;
|
|
376
|
+
uint64_t theta_;
|
|
377
|
+
std::vector<uint64_t, Allocator> entries_;
|
|
432
378
|
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
friend theta_union_alloc<A>;
|
|
436
|
-
friend theta_intersection_alloc<A>;
|
|
437
|
-
friend theta_a_not_b_alloc<A>;
|
|
438
|
-
compact_theta_sketch_alloc(bool is_empty, uint64_t theta, vector_u64<A>&& keys, uint16_t seed_hash, bool is_ordered);
|
|
439
|
-
static compact_theta_sketch_alloc<A> internal_deserialize(std::istream& is, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash);
|
|
440
|
-
static compact_theta_sketch_alloc<A> internal_deserialize(const void* bytes, size_t size, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash);
|
|
441
|
-
};
|
|
442
|
-
|
|
443
|
-
// builder
|
|
444
|
-
|
|
445
|
-
template<typename A>
|
|
446
|
-
class update_theta_sketch_alloc<A>::builder {
|
|
447
|
-
public:
|
|
448
|
-
static const uint8_t MIN_LG_K = 5;
|
|
449
|
-
static const uint8_t DEFAULT_LG_K = 12;
|
|
450
|
-
static const resize_factor DEFAULT_RESIZE_FACTOR = X8;
|
|
451
|
-
|
|
452
|
-
/**
|
|
453
|
-
* Creates and instance of the builder with default parameters.
|
|
454
|
-
*/
|
|
455
|
-
builder();
|
|
456
|
-
|
|
457
|
-
/**
|
|
458
|
-
* Set log2(k), where k is a nominal number of entries in the sketch
|
|
459
|
-
* @param lg_k base 2 logarithm of nominal number of entries
|
|
460
|
-
* @return this builder
|
|
461
|
-
*/
|
|
462
|
-
builder& set_lg_k(uint8_t lg_k);
|
|
463
|
-
|
|
464
|
-
/**
|
|
465
|
-
* Set resize factor for the internal hash table (defaults to 8)
|
|
466
|
-
* @param rf resize factor
|
|
467
|
-
* @return this builder
|
|
468
|
-
*/
|
|
469
|
-
builder& set_resize_factor(resize_factor rf);
|
|
470
|
-
|
|
471
|
-
/**
|
|
472
|
-
* Set sampling probability (initial theta). The default is 1, so the sketch retains
|
|
473
|
-
* all entries until it reaches the limit, at which point it goes into the estimation mode
|
|
474
|
-
* and reduces the effective sampling probability (theta) as necessary.
|
|
475
|
-
* @param p sampling probability
|
|
476
|
-
* @return this builder
|
|
477
|
-
*/
|
|
478
|
-
builder& set_p(float p);
|
|
479
|
-
|
|
480
|
-
/**
|
|
481
|
-
* Set the seed for the hash function. Should be used carefully if needed.
|
|
482
|
-
* Sketches produced with different seed are not compatible
|
|
483
|
-
* and cannot be mixed in set operations.
|
|
484
|
-
* @param seed hash seed
|
|
485
|
-
* @return this builder
|
|
486
|
-
*/
|
|
487
|
-
builder& set_seed(uint64_t seed);
|
|
488
|
-
|
|
489
|
-
/**
|
|
490
|
-
* This is to create an instance of the sketch with predefined parameters.
|
|
491
|
-
* @return and instance of the sketch
|
|
492
|
-
*/
|
|
493
|
-
update_theta_sketch_alloc<A> build() const;
|
|
494
|
-
|
|
495
|
-
private:
|
|
496
|
-
uint8_t lg_k_;
|
|
497
|
-
resize_factor rf_;
|
|
498
|
-
float p_;
|
|
499
|
-
uint64_t seed_;
|
|
500
|
-
|
|
501
|
-
static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
|
|
379
|
+
using ostrstream = typename Base::ostrstream;
|
|
380
|
+
virtual void print_specifics(ostrstream& os) const;
|
|
502
381
|
};
|
|
503
382
|
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
class theta_sketch_alloc<A>::const_iterator: public std::iterator<std::input_iterator_tag, uint64_t> {
|
|
383
|
+
template<typename Allocator>
|
|
384
|
+
class update_theta_sketch_alloc<Allocator>::builder: public theta_base_builder<builder, Allocator> {
|
|
507
385
|
public:
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
bool operator==(const const_iterator& other) const;
|
|
511
|
-
bool operator!=(const const_iterator& other) const;
|
|
512
|
-
uint64_t operator*() const;
|
|
513
|
-
|
|
514
|
-
private:
|
|
515
|
-
const uint64_t* keys_;
|
|
516
|
-
uint32_t size_;
|
|
517
|
-
uint32_t index_;
|
|
518
|
-
const_iterator(const uint64_t* keys, uint32_t size, uint32_t index);
|
|
519
|
-
friend class update_theta_sketch_alloc<A>;
|
|
520
|
-
friend class compact_theta_sketch_alloc<A>;
|
|
386
|
+
builder(const Allocator& allocator = Allocator());
|
|
387
|
+
update_theta_sketch_alloc build() const;
|
|
521
388
|
};
|
|
522
389
|
|
|
523
|
-
|
|
524
390
|
// aliases with default allocator for convenience
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
391
|
+
using theta_sketch = theta_sketch_alloc<std::allocator<uint64_t>>;
|
|
392
|
+
using update_theta_sketch = update_theta_sketch_alloc<std::allocator<uint64_t>>;
|
|
393
|
+
using compact_theta_sketch = compact_theta_sketch_alloc<std::allocator<uint64_t>>;
|
|
528
394
|
|
|
529
395
|
} /* namespace datasketches */
|
|
530
396
|
|