datasketches 0.2.0 → 0.2.4
This diff shows the changes between publicly available package versions, as released to one of the supported registries and as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE +40 -3
- data/NOTICE +1 -1
- data/README.md +7 -7
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
- data/vendor/datasketches-cpp/LICENSE +40 -3
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/pyproject.toml +4 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
- data/vendor/datasketches-cpp/python/README.md +50 -50
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +10 -7
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
- metadata +18 -7
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
|
@@ -49,6 +49,21 @@ public:
|
|
|
49
49
|
}
|
|
50
50
|
};
|
|
51
51
|
|
|
52
|
+
template<bool dummy>
|
|
53
|
+
class theta_build_helper{
|
|
54
|
+
public:
|
|
55
|
+
// consistent way of initializing theta from p
|
|
56
|
+
// avoids multiplication if p == 1 since it might not yield MAX_THETA exactly
|
|
57
|
+
static uint64_t starting_theta_from_p(float p) {
|
|
58
|
+
if (p < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p);
|
|
59
|
+
return theta_constants::MAX_THETA;
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
|
|
63
|
+
return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
|
|
64
|
+
}
|
|
65
|
+
};
|
|
66
|
+
|
|
52
67
|
} /* namespace datasketches */
|
|
53
68
|
|
|
54
69
|
#endif
|
|
@@ -33,14 +33,19 @@ public:
|
|
|
33
33
|
using Sketch = theta_sketch_alloc<Allocator>;
|
|
34
34
|
using CompactSketch = compact_theta_sketch_alloc<Allocator>;
|
|
35
35
|
|
|
36
|
-
struct
|
|
37
|
-
|
|
36
|
+
struct nop_policy {
|
|
37
|
+
void operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
|
|
38
38
|
unused(incoming_entry);
|
|
39
|
-
|
|
39
|
+
unused(internal_entry);
|
|
40
40
|
}
|
|
41
41
|
};
|
|
42
|
-
using State = theta_intersection_base<Entry, ExtractKey,
|
|
42
|
+
using State = theta_intersection_base<Entry, ExtractKey, nop_policy, Sketch, CompactSketch, Allocator>;
|
|
43
43
|
|
|
44
|
+
/*
|
|
45
|
+
* Constructor
|
|
46
|
+
* @param seed for the hash function that was used to create the sketch
|
|
47
|
+
* @param allocator to use for allocating and deallocating memory
|
|
48
|
+
*/
|
|
44
49
|
explicit theta_intersection_alloc(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
|
|
45
50
|
|
|
46
51
|
/**
|
|
@@ -29,7 +29,7 @@ template<typename EN, typename EK, typename P, typename S, typename CS, typename
|
|
|
29
29
|
theta_intersection_base<EN, EK, P, S, CS, A>::theta_intersection_base(uint64_t seed, const P& policy, const A& allocator):
|
|
30
30
|
policy_(policy),
|
|
31
31
|
is_valid_(false),
|
|
32
|
-
table_(0, 0, resize_factor::X1, theta_constants::MAX_THETA, seed, allocator, false)
|
|
32
|
+
table_(0, 0, resize_factor::X1, 1, theta_constants::MAX_THETA, seed, allocator, false)
|
|
33
33
|
{}
|
|
34
34
|
|
|
35
35
|
template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
|
|
@@ -38,17 +38,17 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
|
|
|
38
38
|
if (table_.is_empty_) return;
|
|
39
39
|
if (!sketch.is_empty() && sketch.get_seed_hash() != compute_seed_hash(table_.seed_)) throw std::invalid_argument("seed hash mismatch");
|
|
40
40
|
table_.is_empty_ |= sketch.is_empty();
|
|
41
|
-
table_.theta_ = std::min(table_.theta_, sketch.get_theta64());
|
|
41
|
+
table_.theta_ = table_.is_empty_ ? theta_constants::MAX_THETA : std::min(table_.theta_, sketch.get_theta64());
|
|
42
42
|
if (is_valid_ && table_.num_entries_ == 0) return;
|
|
43
43
|
if (sketch.get_num_retained() == 0) {
|
|
44
44
|
is_valid_ = true;
|
|
45
|
-
table_ = hash_table(0, 0, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
|
45
|
+
table_ = hash_table(0, 0, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
|
46
46
|
return;
|
|
47
47
|
}
|
|
48
48
|
if (!is_valid_) { // first update, copy or move incoming sketch
|
|
49
49
|
is_valid_ = true;
|
|
50
50
|
const uint8_t lg_size = lg_size_from_count(sketch.get_num_retained(), theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
|
|
51
|
-
table_ = hash_table(lg_size, lg_size, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
|
51
|
+
table_ = hash_table(lg_size, lg_size, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
|
52
52
|
for (auto& entry: sketch) {
|
|
53
53
|
auto result = table_.find(EK()(entry));
|
|
54
54
|
if (result.second) {
|
|
@@ -83,11 +83,11 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
|
|
|
83
83
|
throw std::invalid_argument(" fewer keys than expected, possibly corrupted input sketch");
|
|
84
84
|
}
|
|
85
85
|
if (match_count == 0) {
|
|
86
|
-
table_ = hash_table(0, 0, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
|
86
|
+
table_ = hash_table(0, 0, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
|
87
87
|
if (table_.theta_ == theta_constants::MAX_THETA) table_.is_empty_ = true;
|
|
88
88
|
} else {
|
|
89
89
|
const uint8_t lg_size = lg_size_from_count(match_count, theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
|
|
90
|
-
table_ = hash_table(lg_size, lg_size, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
|
90
|
+
table_ = hash_table(lg_size, lg_size, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
|
91
91
|
for (uint32_t i = 0; i < match_count; i++) {
|
|
92
92
|
auto result = table_.find(EK()(matched_entries[i]));
|
|
93
93
|
table_.insert(result.first, std::move(matched_entries[i]));
|
|
@@ -24,7 +24,7 @@ namespace datasketches {
|
|
|
24
24
|
|
|
25
25
|
template<typename A>
|
|
26
26
|
theta_intersection_alloc<A>::theta_intersection_alloc(uint64_t seed, const A& allocator):
|
|
27
|
-
state_(seed,
|
|
27
|
+
state_(seed, nop_policy(), allocator)
|
|
28
28
|
{}
|
|
29
29
|
|
|
30
30
|
template<typename A>
|
|
@@ -46,20 +46,21 @@ public:
|
|
|
46
46
|
*
|
|
47
47
|
* @param sketch_a given sketch A
|
|
48
48
|
* @param sketch_b given sketch B
|
|
49
|
+
* @param seed for the hash function that was used to create the sketch
|
|
49
50
|
* @return a double array {LowerBound, Estimate, UpperBound} of the Jaccard index.
|
|
50
51
|
* The Upper and Lower bounds are for a confidence interval of 95.4% or +/- 2 standard deviations.
|
|
51
52
|
*/
|
|
52
53
|
template<typename SketchA, typename SketchB>
|
|
53
|
-
static std::array<double, 3> jaccard(const SketchA& sketch_a, const SketchB& sketch_b) {
|
|
54
|
+
static std::array<double, 3> jaccard(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
|
|
54
55
|
if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return {1, 1, 1};
|
|
55
56
|
if (sketch_a.is_empty() && sketch_b.is_empty()) return {1, 1, 1};
|
|
56
57
|
if (sketch_a.is_empty() || sketch_b.is_empty()) return {0, 0, 0};
|
|
57
58
|
|
|
58
|
-
auto union_ab = compute_union(sketch_a, sketch_b);
|
|
59
|
+
auto union_ab = compute_union(sketch_a, sketch_b, seed);
|
|
59
60
|
if (identical_sets(sketch_a, sketch_b, union_ab)) return {1, 1, 1};
|
|
60
61
|
|
|
61
62
|
// intersection
|
|
62
|
-
Intersection i;
|
|
63
|
+
Intersection i(seed);
|
|
63
64
|
i.update(sketch_a);
|
|
64
65
|
i.update(sketch_b);
|
|
65
66
|
i.update(union_ab); // ensures that intersection is a subset of the union
|
|
@@ -76,15 +77,16 @@ public:
|
|
|
76
77
|
* Returns true if the two given sketches are equivalent.
|
|
77
78
|
* @param sketch_a the given sketch A
|
|
78
79
|
* @param sketch_b the given sketch B
|
|
80
|
+
* @param seed for the hash function that was used to create the sketch
|
|
79
81
|
* @return true if the two given sketches are exactly equal
|
|
80
82
|
*/
|
|
81
83
|
template<typename SketchA, typename SketchB>
|
|
82
|
-
static bool exactly_equal(const SketchA& sketch_a, const SketchB& sketch_b) {
|
|
84
|
+
static bool exactly_equal(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
|
|
83
85
|
if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return true;
|
|
84
86
|
if (sketch_a.is_empty() && sketch_b.is_empty()) return true;
|
|
85
87
|
if (sketch_a.is_empty() || sketch_b.is_empty()) return false;
|
|
86
88
|
|
|
87
|
-
auto union_ab = compute_union(sketch_a, sketch_b);
|
|
89
|
+
auto union_ab = compute_union(sketch_a, sketch_b, seed);
|
|
88
90
|
if (identical_sets(sketch_a, sketch_b, union_ab)) return true;
|
|
89
91
|
return false;
|
|
90
92
|
}
|
|
@@ -99,12 +101,13 @@ public:
|
|
|
99
101
|
* @param actual the sketch to be tested
|
|
100
102
|
* @param expected the reference sketch that is considered to be correct
|
|
101
103
|
* @param threshold a real value between zero and one
|
|
104
|
+
* @param seed for the hash function that was used to create the sketch
|
|
102
105
|
* @return true if the similarity of the two sketches is greater than the given threshold
|
|
103
106
|
* with at least 97.7% confidence
|
|
104
107
|
*/
|
|
105
108
|
template<typename SketchA, typename SketchB>
|
|
106
|
-
static bool similarity_test(const SketchA& actual, const SketchB& expected, double threshold) {
|
|
107
|
-
auto jc = jaccard(actual, expected);
|
|
109
|
+
static bool similarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
|
|
110
|
+
auto jc = jaccard(actual, expected, seed);
|
|
108
111
|
return jc[0] >= threshold;
|
|
109
112
|
}
|
|
110
113
|
|
|
@@ -118,23 +121,24 @@ public:
|
|
|
118
121
|
* @param actual the sketch to be tested
|
|
119
122
|
* @param expected the reference sketch that is considered to be correct
|
|
120
123
|
* @param threshold a real value between zero and one
|
|
124
|
+
* @param seed for the hash function that was used to create the sketch
|
|
121
125
|
* @return true if the dissimilarity of the two sketches is greater than the given threshold
|
|
122
126
|
* with at least 97.7% confidence
|
|
123
127
|
*/
|
|
124
128
|
template<typename SketchA, typename SketchB>
|
|
125
|
-
static bool dissimilarity_test(const SketchA& actual, const SketchB& expected, double threshold) {
|
|
126
|
-
auto jc = jaccard(actual, expected);
|
|
129
|
+
static bool dissimilarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
|
|
130
|
+
auto jc = jaccard(actual, expected, seed);
|
|
127
131
|
return jc[2] <= threshold;
|
|
128
132
|
}
|
|
129
133
|
|
|
130
134
|
private:
|
|
131
135
|
|
|
132
136
|
template<typename SketchA, typename SketchB>
|
|
133
|
-
static typename Union::CompactSketch compute_union(const SketchA& sketch_a, const SketchB& sketch_b) {
|
|
134
|
-
const
|
|
135
|
-
const
|
|
136
|
-
const
|
|
137
|
-
auto u = typename Union::builder().set_lg_k(lg_k).build();
|
|
137
|
+
static typename Union::CompactSketch compute_union(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed) {
|
|
138
|
+
const auto count_a = sketch_a.get_num_retained();
|
|
139
|
+
const auto count_b = sketch_b.get_num_retained();
|
|
140
|
+
const uint8_t lg_k = std::min(std::max(log2(ceiling_power_of_2(count_a + count_b)), theta_constants::MIN_LG_K), theta_constants::MAX_LG_K);
|
|
141
|
+
auto u = typename Union::builder().set_lg_k(lg_k).set_seed(seed).build();
|
|
138
142
|
u.update(sketch_a);
|
|
139
143
|
u.update(sketch_b);
|
|
140
144
|
return u.get_result(false);
|
|
@@ -36,7 +36,7 @@ seed_hash_(compute_seed_hash(seed))
|
|
|
36
36
|
template<typename EN, typename EK, typename CS, typename A>
|
|
37
37
|
template<typename FwdSketch, typename Sketch>
|
|
38
38
|
CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch& b, bool ordered) const {
|
|
39
|
-
if (a.is_empty() || a.get_num_retained()
|
|
39
|
+
if (a.is_empty() || (a.get_num_retained() > 0 && b.is_empty())) return CS(a, ordered);
|
|
40
40
|
if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
|
|
41
41
|
if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
|
|
42
42
|
|
|
@@ -53,7 +53,7 @@ CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch
|
|
|
53
53
|
conditional_back_inserter(entries, key_less_than<uint64_t, EN, EK>(theta)), comparator());
|
|
54
54
|
} else { // hash-based
|
|
55
55
|
const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), hash_table::REBUILD_THRESHOLD);
|
|
56
|
-
hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 0, 0, allocator_); // theta and seed are not used here
|
|
56
|
+
hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 1, 0, 0, allocator_); // theta and seed are not used here
|
|
57
57
|
for (const auto& entry: b) {
|
|
58
58
|
const uint64_t hash = EK()(entry);
|
|
59
59
|
if (hash < theta) {
|
|
@@ -25,14 +25,10 @@
|
|
|
25
25
|
namespace datasketches {
|
|
26
26
|
|
|
27
27
|
template<typename Allocator = std::allocator<uint64_t>>
|
|
28
|
-
class
|
|
28
|
+
class base_theta_sketch_alloc {
|
|
29
29
|
public:
|
|
30
|
-
using Entry = uint64_t;
|
|
31
|
-
using ExtractKey = trivial_extract_key;
|
|
32
|
-
using iterator = theta_iterator<Entry, ExtractKey>;
|
|
33
|
-
using const_iterator = theta_const_iterator<Entry, ExtractKey>;
|
|
34
30
|
|
|
35
|
-
virtual ~
|
|
31
|
+
virtual ~base_theta_sketch_alloc() = default;
|
|
36
32
|
|
|
37
33
|
/**
|
|
38
34
|
* @return allocator
|
|
@@ -104,6 +100,21 @@ public:
|
|
|
104
100
|
*/
|
|
105
101
|
virtual string<Allocator> to_string(bool print_items = false) const;
|
|
106
102
|
|
|
103
|
+
protected:
|
|
104
|
+
virtual void print_specifics(std::ostringstream& os) const = 0;
|
|
105
|
+
virtual void print_items(std::ostringstream& os) const = 0;
|
|
106
|
+
};
|
|
107
|
+
|
|
108
|
+
template<typename Allocator = std::allocator<uint64_t>>
|
|
109
|
+
class theta_sketch_alloc: public base_theta_sketch_alloc<Allocator> {
|
|
110
|
+
public:
|
|
111
|
+
using Entry = uint64_t;
|
|
112
|
+
using ExtractKey = trivial_extract_key;
|
|
113
|
+
using iterator = theta_iterator<Entry, ExtractKey>;
|
|
114
|
+
using const_iterator = theta_const_iterator<Entry, ExtractKey>;
|
|
115
|
+
|
|
116
|
+
virtual ~theta_sketch_alloc() = default;
|
|
117
|
+
|
|
107
118
|
/**
|
|
108
119
|
* Iterator over hash values in this sketch.
|
|
109
120
|
* @return begin iterator
|
|
@@ -131,8 +142,7 @@ public:
|
|
|
131
142
|
virtual const_iterator end() const = 0;
|
|
132
143
|
|
|
133
144
|
protected:
|
|
134
|
-
|
|
135
|
-
virtual void print_specifics(ostrstream& os) const = 0;
|
|
145
|
+
virtual void print_items(std::ostringstream& os) const;
|
|
136
146
|
};
|
|
137
147
|
|
|
138
148
|
// forward declaration
|
|
@@ -269,6 +279,11 @@ public:
|
|
|
269
279
|
*/
|
|
270
280
|
void trim();
|
|
271
281
|
|
|
282
|
+
/**
|
|
283
|
+
* Reset the sketch to the initial empty state
|
|
284
|
+
*/
|
|
285
|
+
void reset();
|
|
286
|
+
|
|
272
287
|
/**
|
|
273
288
|
* Converts this sketch to a compact sketch (ordered or unordered).
|
|
274
289
|
* @param ordered optional flag to specify if ordered sketch should be produced
|
|
@@ -285,11 +300,10 @@ private:
|
|
|
285
300
|
theta_table table_;
|
|
286
301
|
|
|
287
302
|
// for builder
|
|
288
|
-
update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
|
|
289
|
-
uint64_t seed, const Allocator& allocator);
|
|
303
|
+
update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p,
|
|
304
|
+
uint64_t theta, uint64_t seed, const Allocator& allocator);
|
|
290
305
|
|
|
291
|
-
|
|
292
|
-
virtual void print_specifics(ostrstream& os) const;
|
|
306
|
+
virtual void print_specifics(std::ostringstream& os) const;
|
|
293
307
|
};
|
|
294
308
|
|
|
295
309
|
// compact sketch
|
|
@@ -311,7 +325,8 @@ public:
|
|
|
311
325
|
// - as a result of a set operation
|
|
312
326
|
// - by deserializing a previously serialized compact sketch
|
|
313
327
|
|
|
314
|
-
|
|
328
|
+
template<typename Other>
|
|
329
|
+
compact_theta_sketch_alloc(const Other& other, bool ordered);
|
|
315
330
|
compact_theta_sketch_alloc(const compact_theta_sketch_alloc&) = default;
|
|
316
331
|
compact_theta_sketch_alloc(compact_theta_sketch_alloc&&) noexcept = default;
|
|
317
332
|
virtual ~compact_theta_sketch_alloc() = default;
|
|
@@ -376,8 +391,7 @@ private:
|
|
|
376
391
|
uint64_t theta_;
|
|
377
392
|
std::vector<uint64_t, Allocator> entries_;
|
|
378
393
|
|
|
379
|
-
|
|
380
|
-
virtual void print_specifics(ostrstream& os) const;
|
|
394
|
+
virtual void print_specifics(std::ostringstream& os) const;
|
|
381
395
|
};
|
|
382
396
|
|
|
383
397
|
template<typename Allocator>
|
|
@@ -387,10 +401,54 @@ public:
|
|
|
387
401
|
update_theta_sketch_alloc build() const;
|
|
388
402
|
};
|
|
389
403
|
|
|
404
|
+
// This is to wrap a buffer containing a serialized compact sketch and use it in a set operation avoiding some cost of deserialization.
|
|
405
|
+
// It does not take the ownership of the buffer.
|
|
406
|
+
|
|
407
|
+
template<typename Allocator = std::allocator<uint64_t>>
|
|
408
|
+
class wrapped_compact_theta_sketch_alloc : public base_theta_sketch_alloc<Allocator> {
|
|
409
|
+
public:
|
|
410
|
+
using const_iterator = const uint64_t*;
|
|
411
|
+
|
|
412
|
+
Allocator get_allocator() const;
|
|
413
|
+
bool is_empty() const;
|
|
414
|
+
bool is_ordered() const;
|
|
415
|
+
uint64_t get_theta64() const;
|
|
416
|
+
uint32_t get_num_retained() const;
|
|
417
|
+
uint16_t get_seed_hash() const;
|
|
418
|
+
|
|
419
|
+
const_iterator begin() const;
|
|
420
|
+
const_iterator end() const;
|
|
421
|
+
|
|
422
|
+
/**
|
|
423
|
+
* This method wraps a serialized compact sketch as an array of bytes.
|
|
424
|
+
* @param bytes pointer to the array of bytes
|
|
425
|
+
* @param size the size of the array
|
|
426
|
+
* @param seed the seed for the hash function that was used to create the sketch
|
|
427
|
+
* @return an instance of the sketch
|
|
428
|
+
*/
|
|
429
|
+
static const wrapped_compact_theta_sketch_alloc wrap(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, bool dump_on_error = false);
|
|
430
|
+
|
|
431
|
+
protected:
|
|
432
|
+
virtual void print_specifics(std::ostringstream& os) const;
|
|
433
|
+
virtual void print_items(std::ostringstream& os) const;
|
|
434
|
+
|
|
435
|
+
private:
|
|
436
|
+
bool is_empty_;
|
|
437
|
+
bool is_ordered_;
|
|
438
|
+
uint16_t seed_hash_;
|
|
439
|
+
uint32_t num_entries_;
|
|
440
|
+
uint64_t theta_;
|
|
441
|
+
const uint64_t* entries_;
|
|
442
|
+
|
|
443
|
+
wrapped_compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint32_t num_entries,
|
|
444
|
+
uint64_t theta, const uint64_t* entries);
|
|
445
|
+
};
|
|
446
|
+
|
|
390
447
|
// aliases with default allocator for convenience
|
|
391
448
|
using theta_sketch = theta_sketch_alloc<std::allocator<uint64_t>>;
|
|
392
449
|
using update_theta_sketch = update_theta_sketch_alloc<std::allocator<uint64_t>>;
|
|
393
450
|
using compact_theta_sketch = compact_theta_sketch_alloc<std::allocator<uint64_t>>;
|
|
451
|
+
using wrapped_compact_theta_sketch = wrapped_compact_theta_sketch_alloc<std::allocator<uint64_t>>;
|
|
394
452
|
|
|
395
453
|
} /* namespace datasketches */
|
|
396
454
|
|