datasketches 0.2.0 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE +40 -3
- data/NOTICE +1 -1
- data/README.md +7 -7
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
- data/vendor/datasketches-cpp/LICENSE +40 -3
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/pyproject.toml +4 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
- data/vendor/datasketches-cpp/python/README.md +50 -50
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +10 -7
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
- metadata +18 -7
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -49,6 +49,21 @@ public:
|
|
49
49
|
}
|
50
50
|
};
|
51
51
|
|
52
|
+
template<bool dummy>
|
53
|
+
class theta_build_helper{
|
54
|
+
public:
|
55
|
+
// consistent way of initializing theta from p
|
56
|
+
// avoids multiplication if p == 1 since it might not yield MAX_THETA exactly
|
57
|
+
static uint64_t starting_theta_from_p(float p) {
|
58
|
+
if (p < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p);
|
59
|
+
return theta_constants::MAX_THETA;
|
60
|
+
}
|
61
|
+
|
62
|
+
static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
|
63
|
+
return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
|
64
|
+
}
|
65
|
+
};
|
66
|
+
|
52
67
|
} /* namespace datasketches */
|
53
68
|
|
54
69
|
#endif
|
@@ -33,14 +33,19 @@ public:
|
|
33
33
|
using Sketch = theta_sketch_alloc<Allocator>;
|
34
34
|
using CompactSketch = compact_theta_sketch_alloc<Allocator>;
|
35
35
|
|
36
|
-
struct
|
37
|
-
|
36
|
+
struct nop_policy {
|
37
|
+
void operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
|
38
38
|
unused(incoming_entry);
|
39
|
-
|
39
|
+
unused(internal_entry);
|
40
40
|
}
|
41
41
|
};
|
42
|
-
using State = theta_intersection_base<Entry, ExtractKey,
|
42
|
+
using State = theta_intersection_base<Entry, ExtractKey, nop_policy, Sketch, CompactSketch, Allocator>;
|
43
43
|
|
44
|
+
/**
|
45
|
+
* Constructor
|
46
|
+
* @param seed for the hash function that was used to create the sketch
|
47
|
+
* @param allocator to use for allocating and deallocating memory
|
48
|
+
*/
|
44
49
|
explicit theta_intersection_alloc(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
|
45
50
|
|
46
51
|
/**
|
@@ -29,7 +29,7 @@ template<typename EN, typename EK, typename P, typename S, typename CS, typename
|
|
29
29
|
theta_intersection_base<EN, EK, P, S, CS, A>::theta_intersection_base(uint64_t seed, const P& policy, const A& allocator):
|
30
30
|
policy_(policy),
|
31
31
|
is_valid_(false),
|
32
|
-
table_(0, 0, resize_factor::X1, theta_constants::MAX_THETA, seed, allocator, false)
|
32
|
+
table_(0, 0, resize_factor::X1, 1, theta_constants::MAX_THETA, seed, allocator, false)
|
33
33
|
{}
|
34
34
|
|
35
35
|
template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
|
@@ -38,17 +38,17 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
|
|
38
38
|
if (table_.is_empty_) return;
|
39
39
|
if (!sketch.is_empty() && sketch.get_seed_hash() != compute_seed_hash(table_.seed_)) throw std::invalid_argument("seed hash mismatch");
|
40
40
|
table_.is_empty_ |= sketch.is_empty();
|
41
|
-
table_.theta_ = std::min(table_.theta_, sketch.get_theta64());
|
41
|
+
table_.theta_ = table_.is_empty_ ? theta_constants::MAX_THETA : std::min(table_.theta_, sketch.get_theta64());
|
42
42
|
if (is_valid_ && table_.num_entries_ == 0) return;
|
43
43
|
if (sketch.get_num_retained() == 0) {
|
44
44
|
is_valid_ = true;
|
45
|
-
table_ = hash_table(0, 0, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
45
|
+
table_ = hash_table(0, 0, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
46
46
|
return;
|
47
47
|
}
|
48
48
|
if (!is_valid_) { // first update, copy or move incoming sketch
|
49
49
|
is_valid_ = true;
|
50
50
|
const uint8_t lg_size = lg_size_from_count(sketch.get_num_retained(), theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
|
51
|
-
table_ = hash_table(lg_size, lg_size, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
51
|
+
table_ = hash_table(lg_size, lg_size, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
52
52
|
for (auto& entry: sketch) {
|
53
53
|
auto result = table_.find(EK()(entry));
|
54
54
|
if (result.second) {
|
@@ -83,11 +83,11 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
|
|
83
83
|
throw std::invalid_argument(" fewer keys than expected, possibly corrupted input sketch");
|
84
84
|
}
|
85
85
|
if (match_count == 0) {
|
86
|
-
table_ = hash_table(0, 0, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
86
|
+
table_ = hash_table(0, 0, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
87
87
|
if (table_.theta_ == theta_constants::MAX_THETA) table_.is_empty_ = true;
|
88
88
|
} else {
|
89
89
|
const uint8_t lg_size = lg_size_from_count(match_count, theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
|
90
|
-
table_ = hash_table(lg_size, lg_size, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
90
|
+
table_ = hash_table(lg_size, lg_size, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
91
91
|
for (uint32_t i = 0; i < match_count; i++) {
|
92
92
|
auto result = table_.find(EK()(matched_entries[i]));
|
93
93
|
table_.insert(result.first, std::move(matched_entries[i]));
|
@@ -24,7 +24,7 @@ namespace datasketches {
|
|
24
24
|
|
25
25
|
template<typename A>
|
26
26
|
theta_intersection_alloc<A>::theta_intersection_alloc(uint64_t seed, const A& allocator):
|
27
|
-
state_(seed,
|
27
|
+
state_(seed, nop_policy(), allocator)
|
28
28
|
{}
|
29
29
|
|
30
30
|
template<typename A>
|
@@ -46,20 +46,21 @@ public:
|
|
46
46
|
*
|
47
47
|
* @param sketch_a given sketch A
|
48
48
|
* @param sketch_b given sketch B
|
49
|
+
* @param seed for the hash function that was used to create the sketch
|
49
50
|
* @return a double array {LowerBound, Estimate, UpperBound} of the Jaccard index.
|
50
51
|
* The Upper and Lower bounds are for a confidence interval of 95.4% or +/- 2 standard deviations.
|
51
52
|
*/
|
52
53
|
template<typename SketchA, typename SketchB>
|
53
|
-
static std::array<double, 3> jaccard(const SketchA& sketch_a, const SketchB& sketch_b) {
|
54
|
+
static std::array<double, 3> jaccard(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
|
54
55
|
if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return {1, 1, 1};
|
55
56
|
if (sketch_a.is_empty() && sketch_b.is_empty()) return {1, 1, 1};
|
56
57
|
if (sketch_a.is_empty() || sketch_b.is_empty()) return {0, 0, 0};
|
57
58
|
|
58
|
-
auto union_ab = compute_union(sketch_a, sketch_b);
|
59
|
+
auto union_ab = compute_union(sketch_a, sketch_b, seed);
|
59
60
|
if (identical_sets(sketch_a, sketch_b, union_ab)) return {1, 1, 1};
|
60
61
|
|
61
62
|
// intersection
|
62
|
-
Intersection i;
|
63
|
+
Intersection i(seed);
|
63
64
|
i.update(sketch_a);
|
64
65
|
i.update(sketch_b);
|
65
66
|
i.update(union_ab); // ensures that intersection is a subset of the union
|
@@ -76,15 +77,16 @@ public:
|
|
76
77
|
* Returns true if the two given sketches are equivalent.
|
77
78
|
* @param sketch_a the given sketch A
|
78
79
|
* @param sketch_b the given sketch B
|
80
|
+
* @param seed for the hash function that was used to create the sketch
|
79
81
|
* @return true if the two given sketches are exactly equal
|
80
82
|
*/
|
81
83
|
template<typename SketchA, typename SketchB>
|
82
|
-
static bool exactly_equal(const SketchA& sketch_a, const SketchB& sketch_b) {
|
84
|
+
static bool exactly_equal(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
|
83
85
|
if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return true;
|
84
86
|
if (sketch_a.is_empty() && sketch_b.is_empty()) return true;
|
85
87
|
if (sketch_a.is_empty() || sketch_b.is_empty()) return false;
|
86
88
|
|
87
|
-
auto union_ab = compute_union(sketch_a, sketch_b);
|
89
|
+
auto union_ab = compute_union(sketch_a, sketch_b, seed);
|
88
90
|
if (identical_sets(sketch_a, sketch_b, union_ab)) return true;
|
89
91
|
return false;
|
90
92
|
}
|
@@ -99,12 +101,13 @@ public:
|
|
99
101
|
* @param actual the sketch to be tested
|
100
102
|
* @param expected the reference sketch that is considered to be correct
|
101
103
|
* @param threshold a real value between zero and one
|
104
|
+
* @param seed for the hash function that was used to create the sketch
|
102
105
|
* @return true if the similarity of the two sketches is greater than the given threshold
|
103
106
|
* with at least 97.7% confidence
|
104
107
|
*/
|
105
108
|
template<typename SketchA, typename SketchB>
|
106
|
-
static bool similarity_test(const SketchA& actual, const SketchB& expected, double threshold) {
|
107
|
-
auto jc = jaccard(actual, expected);
|
109
|
+
static bool similarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
|
110
|
+
auto jc = jaccard(actual, expected, seed);
|
108
111
|
return jc[0] >= threshold;
|
109
112
|
}
|
110
113
|
|
@@ -118,23 +121,24 @@ public:
|
|
118
121
|
* @param actual the sketch to be tested
|
119
122
|
* @param expected the reference sketch that is considered to be correct
|
120
123
|
* @param threshold a real value between zero and one
|
124
|
+
* @param seed for the hash function that was used to create the sketch
|
121
125
|
* @return true if the dissimilarity of the two sketches is greater than the given threshold
|
122
126
|
* with at least 97.7% confidence
|
123
127
|
*/
|
124
128
|
template<typename SketchA, typename SketchB>
|
125
|
-
static bool dissimilarity_test(const SketchA& actual, const SketchB& expected, double threshold) {
|
126
|
-
auto jc = jaccard(actual, expected);
|
129
|
+
static bool dissimilarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
|
130
|
+
auto jc = jaccard(actual, expected, seed);
|
127
131
|
return jc[2] <= threshold;
|
128
132
|
}
|
129
133
|
|
130
134
|
private:
|
131
135
|
|
132
136
|
template<typename SketchA, typename SketchB>
|
133
|
-
static typename Union::CompactSketch compute_union(const SketchA& sketch_a, const SketchB& sketch_b) {
|
134
|
-
const
|
135
|
-
const
|
136
|
-
const
|
137
|
-
auto u = typename Union::builder().set_lg_k(lg_k).build();
|
137
|
+
static typename Union::CompactSketch compute_union(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed) {
|
138
|
+
const auto count_a = sketch_a.get_num_retained();
|
139
|
+
const auto count_b = sketch_b.get_num_retained();
|
140
|
+
const uint8_t lg_k = std::min(std::max(log2(ceiling_power_of_2(count_a + count_b)), theta_constants::MIN_LG_K), theta_constants::MAX_LG_K);
|
141
|
+
auto u = typename Union::builder().set_lg_k(lg_k).set_seed(seed).build();
|
138
142
|
u.update(sketch_a);
|
139
143
|
u.update(sketch_b);
|
140
144
|
return u.get_result(false);
|
@@ -36,7 +36,7 @@ seed_hash_(compute_seed_hash(seed))
|
|
36
36
|
template<typename EN, typename EK, typename CS, typename A>
|
37
37
|
template<typename FwdSketch, typename Sketch>
|
38
38
|
CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch& b, bool ordered) const {
|
39
|
-
if (a.is_empty() || a.get_num_retained()
|
39
|
+
if (a.is_empty() || (a.get_num_retained() > 0 && b.is_empty())) return CS(a, ordered);
|
40
40
|
if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
|
41
41
|
if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
|
42
42
|
|
@@ -53,7 +53,7 @@ CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch
|
|
53
53
|
conditional_back_inserter(entries, key_less_than<uint64_t, EN, EK>(theta)), comparator());
|
54
54
|
} else { // hash-based
|
55
55
|
const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), hash_table::REBUILD_THRESHOLD);
|
56
|
-
hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 0, 0, allocator_); // theta and seed are not used here
|
56
|
+
hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 1, 0, 0, allocator_); // theta and seed are not used here
|
57
57
|
for (const auto& entry: b) {
|
58
58
|
const uint64_t hash = EK()(entry);
|
59
59
|
if (hash < theta) {
|
@@ -25,14 +25,10 @@
|
|
25
25
|
namespace datasketches {
|
26
26
|
|
27
27
|
template<typename Allocator = std::allocator<uint64_t>>
|
28
|
-
class
|
28
|
+
class base_theta_sketch_alloc {
|
29
29
|
public:
|
30
|
-
using Entry = uint64_t;
|
31
|
-
using ExtractKey = trivial_extract_key;
|
32
|
-
using iterator = theta_iterator<Entry, ExtractKey>;
|
33
|
-
using const_iterator = theta_const_iterator<Entry, ExtractKey>;
|
34
30
|
|
35
|
-
virtual ~
|
31
|
+
virtual ~base_theta_sketch_alloc() = default;
|
36
32
|
|
37
33
|
/**
|
38
34
|
* @return allocator
|
@@ -104,6 +100,21 @@ public:
|
|
104
100
|
*/
|
105
101
|
virtual string<Allocator> to_string(bool print_items = false) const;
|
106
102
|
|
103
|
+
protected:
|
104
|
+
virtual void print_specifics(std::ostringstream& os) const = 0;
|
105
|
+
virtual void print_items(std::ostringstream& os) const = 0;
|
106
|
+
};
|
107
|
+
|
108
|
+
template<typename Allocator = std::allocator<uint64_t>>
|
109
|
+
class theta_sketch_alloc: public base_theta_sketch_alloc<Allocator> {
|
110
|
+
public:
|
111
|
+
using Entry = uint64_t;
|
112
|
+
using ExtractKey = trivial_extract_key;
|
113
|
+
using iterator = theta_iterator<Entry, ExtractKey>;
|
114
|
+
using const_iterator = theta_const_iterator<Entry, ExtractKey>;
|
115
|
+
|
116
|
+
virtual ~theta_sketch_alloc() = default;
|
117
|
+
|
107
118
|
/**
|
108
119
|
* Iterator over hash values in this sketch.
|
109
120
|
* @return begin iterator
|
@@ -131,8 +142,7 @@ public:
|
|
131
142
|
virtual const_iterator end() const = 0;
|
132
143
|
|
133
144
|
protected:
|
134
|
-
|
135
|
-
virtual void print_specifics(ostrstream& os) const = 0;
|
145
|
+
virtual void print_items(std::ostringstream& os) const;
|
136
146
|
};
|
137
147
|
|
138
148
|
// forward declaration
|
@@ -269,6 +279,11 @@ public:
|
|
269
279
|
*/
|
270
280
|
void trim();
|
271
281
|
|
282
|
+
/**
|
283
|
+
* Reset the sketch to the initial empty state
|
284
|
+
*/
|
285
|
+
void reset();
|
286
|
+
|
272
287
|
/**
|
273
288
|
* Converts this sketch to a compact sketch (ordered or unordered).
|
274
289
|
* @param ordered optional flag to specify if ordered sketch should be produced
|
@@ -285,11 +300,10 @@ private:
|
|
285
300
|
theta_table table_;
|
286
301
|
|
287
302
|
// for builder
|
288
|
-
update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
|
289
|
-
uint64_t seed, const Allocator& allocator);
|
303
|
+
update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p,
|
304
|
+
uint64_t theta, uint64_t seed, const Allocator& allocator);
|
290
305
|
|
291
|
-
|
292
|
-
virtual void print_specifics(ostrstream& os) const;
|
306
|
+
virtual void print_specifics(std::ostringstream& os) const;
|
293
307
|
};
|
294
308
|
|
295
309
|
// compact sketch
|
@@ -311,7 +325,8 @@ public:
|
|
311
325
|
// - as a result of a set operation
|
312
326
|
// - by deserializing a previously serialized compact sketch
|
313
327
|
|
314
|
-
|
328
|
+
template<typename Other>
|
329
|
+
compact_theta_sketch_alloc(const Other& other, bool ordered);
|
315
330
|
compact_theta_sketch_alloc(const compact_theta_sketch_alloc&) = default;
|
316
331
|
compact_theta_sketch_alloc(compact_theta_sketch_alloc&&) noexcept = default;
|
317
332
|
virtual ~compact_theta_sketch_alloc() = default;
|
@@ -376,8 +391,7 @@ private:
|
|
376
391
|
uint64_t theta_;
|
377
392
|
std::vector<uint64_t, Allocator> entries_;
|
378
393
|
|
379
|
-
|
380
|
-
virtual void print_specifics(ostrstream& os) const;
|
394
|
+
virtual void print_specifics(std::ostringstream& os) const;
|
381
395
|
};
|
382
396
|
|
383
397
|
template<typename Allocator>
|
@@ -387,10 +401,54 @@ public:
|
|
387
401
|
update_theta_sketch_alloc build() const;
|
388
402
|
};
|
389
403
|
|
404
|
+
// Wraps a buffer containing a serialized compact sketch so that it can be used in a set operation, avoiding some of the cost of deserialization.
|
405
|
+
// It does not take ownership of the buffer.
|
406
|
+
|
407
|
+
template<typename Allocator = std::allocator<uint64_t>>
|
408
|
+
class wrapped_compact_theta_sketch_alloc : public base_theta_sketch_alloc<Allocator> {
|
409
|
+
public:
|
410
|
+
using const_iterator = const uint64_t*;
|
411
|
+
|
412
|
+
Allocator get_allocator() const;
|
413
|
+
bool is_empty() const;
|
414
|
+
bool is_ordered() const;
|
415
|
+
uint64_t get_theta64() const;
|
416
|
+
uint32_t get_num_retained() const;
|
417
|
+
uint16_t get_seed_hash() const;
|
418
|
+
|
419
|
+
const_iterator begin() const;
|
420
|
+
const_iterator end() const;
|
421
|
+
|
422
|
+
/**
|
423
|
+
* This method wraps a serialized compact sketch that is provided as an array of bytes.
|
424
|
+
* @param bytes pointer to the array of bytes
|
425
|
+
* @param size the size of the array
|
426
|
+
* @param seed the seed for the hash function that was used to create the sketch
|
427
|
+
* @return an instance of the sketch
|
428
|
+
*/
|
429
|
+
static const wrapped_compact_theta_sketch_alloc wrap(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, bool dump_on_error = false);
|
430
|
+
|
431
|
+
protected:
|
432
|
+
virtual void print_specifics(std::ostringstream& os) const;
|
433
|
+
virtual void print_items(std::ostringstream& os) const;
|
434
|
+
|
435
|
+
private:
|
436
|
+
bool is_empty_;
|
437
|
+
bool is_ordered_;
|
438
|
+
uint16_t seed_hash_;
|
439
|
+
uint32_t num_entries_;
|
440
|
+
uint64_t theta_;
|
441
|
+
const uint64_t* entries_;
|
442
|
+
|
443
|
+
wrapped_compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint32_t num_entries,
|
444
|
+
uint64_t theta, const uint64_t* entries);
|
445
|
+
};
|
446
|
+
|
390
447
|
// aliases with default allocator for convenience
|
391
448
|
using theta_sketch = theta_sketch_alloc<std::allocator<uint64_t>>;
|
392
449
|
using update_theta_sketch = update_theta_sketch_alloc<std::allocator<uint64_t>>;
|
393
450
|
using compact_theta_sketch = compact_theta_sketch_alloc<std::allocator<uint64_t>>;
|
451
|
+
using wrapped_compact_theta_sketch = wrapped_compact_theta_sketch_alloc<std::allocator<uint64_t>>;
|
394
452
|
|
395
453
|
} /* namespace datasketches */
|
396
454
|
|