datasketches 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <iostream>
|
|
21
|
+
|
|
22
|
+
#include <catch.hpp>
|
|
23
|
+
#include <jaccard_similarity.hpp>
|
|
24
|
+
|
|
25
|
+
namespace datasketches {
|
|
26
|
+
|
|
27
|
+
// Shorthand used by the tests below: the experimental update theta sketch template with its default template arguments.
using update_theta_sketch = update_theta_sketch_experimental<>;
|
|
28
|
+
|
|
29
|
+
// Two sketches that never received an update are expected to produce a
// Jaccard result whose three entries are all exactly 1.
TEST_CASE("theta jaccard: empty", "[theta_sketch]") {
  const std::array<double, 3> all_ones{1.0, 1.0, 1.0};

  auto first = update_theta_sketch::builder().build();
  auto second = update_theta_sketch::builder().build();

  const auto result = theta_jaccard_similarity::jaccard(first, second);
  REQUIRE(result == all_ones);
}
|
|
35
|
+
|
|
36
|
+
// A sketch holding the keys 0..999 is compared against itself; every entry of
// the Jaccard result must be exactly 1, for the update sketch as well as for
// its compact form.
TEST_CASE("theta jaccard: same sketch exact mode", "[theta_sketch]") {
  const std::array<double, 3> identical{1.0, 1.0, 1.0};

  auto sketch = update_theta_sketch::builder().build();
  for (int key = 0; key != 1000; ++key) {
    sketch.update(key);
  }

  // the update sketch against itself
  const auto from_update = theta_jaccard_similarity::jaccard(sketch, sketch);
  REQUIRE(from_update == identical);

  // two compact results obtained from the same sketch
  const auto from_compact = theta_jaccard_similarity::jaccard(sketch.compact(), sketch.compact());
  REQUIRE(from_compact == identical);
}
|
|
48
|
+
|
|
49
|
+
// Two separate sketches are fed the very same keys 0..999; every entry of the
// Jaccard result must be exactly 1, for the update sketches as well as for
// their compact forms.
TEST_CASE("theta jaccard: full overlap exact mode", "[theta_sketch]") {
  const std::array<double, 3> identical{1.0, 1.0, 1.0};

  auto left = update_theta_sketch::builder().build();
  auto right = update_theta_sketch::builder().build();
  for (int key = 0; key != 1000; ++key) {
    left.update(key);
    right.update(key);
  }

  // the update sketches
  const auto from_update = theta_jaccard_similarity::jaccard(left, right);
  REQUIRE(from_update == identical);

  // their compact forms
  const auto from_compact = theta_jaccard_similarity::jaccard(left.compact(), right.compact());
  REQUIRE(from_compact == identical);
}
|
|
65
|
+
|
|
66
|
+
// The first sketch sees the keys 0..999 and the second one 1000..1999, so the
// two key ranges share nothing; every entry of the Jaccard result must be
// exactly 0, for the update sketches as well as for their compact forms.
TEST_CASE("theta jaccard: disjoint exact mode", "[theta_sketch]") {
  const std::array<double, 3> no_overlap{0.0, 0.0, 0.0};

  auto left = update_theta_sketch::builder().build();
  auto right = update_theta_sketch::builder().build();
  for (int key = 0; key != 1000; ++key) {
    left.update(key);
    right.update(key + 1000);
  }

  // the update sketches
  const auto from_update = theta_jaccard_similarity::jaccard(left, right);
  REQUIRE(from_update == no_overlap);

  // their compact forms
  const auto from_compact = theta_jaccard_similarity::jaccard(left.compact(), right.compact());
  REQUIRE(from_compact == no_overlap);
}
|
|
82
|
+
|
|
83
|
+
// The first sketch sees the keys 0..9999 and the second one 5000..14999, so
// 5000 of the 15000 distinct keys are shared. All three entries of the Jaccard
// result must be within 0.01 of 0.33, for the update sketches as well as for
// their compact forms.
TEST_CASE("theta jaccard: half overlap estimation mode", "[theta_sketch]") {
  auto left = update_theta_sketch::builder().build();
  auto right = update_theta_sketch::builder().build();
  for (int key = 0; key != 10000; ++key) {
    left.update(key);
    right.update(key + 5000);
  }

  // the update sketches
  const auto from_update = theta_jaccard_similarity::jaccard(left, right);
  for (int idx = 0; idx < 3; ++idx) {
    REQUIRE(from_update[idx] == Approx(0.33).margin(0.01));
  }

  // their compact forms
  const auto from_compact = theta_jaccard_similarity::jaccard(left.compact(), right.compact());
  for (int idx = 0; idx < 3; ++idx) {
    REQUIRE(from_compact[idx] == Approx(0.33).margin(0.01));
  }
}
|
|
103
|
+
|
|
104
|
+
/**
|
|
105
|
+
* The distribution is quite tight, about +/- 0.7%, which is pretty good since the accuracy of the
|
|
106
|
+
* underlying sketch is about +/- 1.56%.
|
|
107
|
+
*/
|
|
108
|
+
// Checks similarity_test() with a sketch of the keys [0, u1) and a sketch of
// the 95% subset [0, u2), both built with lg_k = 12.
TEST_CASE("theta jaccard: similarity test", "[theta_sketch]") {
  const int8_t min_lg_k = 12;
  const int u1 = 1 << 20;
  // u2 is 95% of u1 with the fractional part dropped on purpose; the cast makes
  // the double -> int truncation explicit instead of an implicit narrowing conversion
  const int u2 = static_cast<int>(u1 * 0.95);
  const double threshold = 0.943;

  // "expected" receives the keys [0, u1)
  auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).build();
  for (int i = 0; i < u1; ++i) expected.update(i);

  // "actual" receives the keys [0, u2), a subset of what "expected" saw
  auto actual = update_theta_sketch::builder().set_lg_k(min_lg_k).build();
  for (int i = 0; i < u2; ++i) actual.update(i);

  // both the 95% subset and a sketch compared with itself must pass at this threshold
  REQUIRE(theta_jaccard_similarity::similarity_test(actual, expected, threshold));
  REQUIRE(theta_jaccard_similarity::similarity_test(actual, actual, threshold));
}
|
|
123
|
+
|
|
124
|
+
/**
|
|
125
|
+
* The distribution is much looser here, about +/- 14%. This is due to the fact that intersections lose accuracy
|
|
126
|
+
* as the ratio of intersection to the union becomes a small number.
|
|
127
|
+
*/
|
|
128
|
+
// Checks dissimilarity_test() with a sketch of the keys [0, u1) and a sketch
// of the 5% subset [0, u2), both built with lg_k = 12.
TEST_CASE("theta jaccard: dissimilarity test", "[theta_sketch]") {
  const int8_t min_lg_k = 12;
  const int u1 = 1 << 20;
  // u2 is 5% of u1 with the fractional part dropped on purpose; the cast makes
  // the double -> int truncation explicit instead of an implicit narrowing conversion
  const int u2 = static_cast<int>(u1 * 0.05);
  const double threshold = 0.061;

  // "expected" receives the keys [0, u1)
  auto expected = update_theta_sketch::builder().set_lg_k(min_lg_k).build();
  for (int i = 0; i < u1; ++i) expected.update(i);

  // "actual" receives the keys [0, u2), a small subset of what "expected" saw
  auto actual = update_theta_sketch::builder().set_lg_k(min_lg_k).build();
  for (int i = 0; i < u2; ++i) actual.update(i);

  // the 5% subset must be reported as dissimilar, a sketch compared with itself must not
  REQUIRE(theta_jaccard_similarity::dissimilarity_test(actual, expected, threshold));
  REQUIRE_FALSE(theta_jaccard_similarity::dissimilarity_test(actual, actual, threshold));
}
|
|
143
|
+
|
|
144
|
+
} /* namespace datasketches */
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <fstream>
|
|
21
|
+
#include <sstream>
|
|
22
|
+
|
|
23
|
+
#include <catch.hpp>
|
|
24
|
+
#include <theta_sketch_experimental.hpp>
|
|
25
|
+
|
|
26
|
+
namespace datasketches {
|
|
27
|
+
|
|
28
|
+
// Directory prefix prepended to the names of the binary sketch files read by
// the deserialization tests. TEST_BINARY_INPUT_PATH overrides it when defined;
// otherwise the prefix is "test/".
#if !defined(TEST_BINARY_INPUT_PATH)
const std::string inputPath = "test/";
#else
const std::string inputPath = TEST_BINARY_INPUT_PATH;
#endif
|
|
33
|
+
|
|
34
|
+
// These tests have been copied from the existing theta sketch implementation.
|
|
35
|
+
// Serialization as base class and serialization of update sketch have been removed.
|
|
36
|
+
|
|
37
|
+
// Shorthand used by the tests below: the experimental update theta sketch template with its default template arguments.
using update_theta_sketch = update_theta_sketch_experimental<>;
|
|
38
|
+
// Shorthand used by the tests below: the experimental compact theta sketch template with its default template arguments.
using compact_theta_sketch = compact_theta_sketch_experimental<>;
|
|
39
|
+
|
|
40
|
+
// A freshly built sketch must report itself as empty, not in estimation mode,
// theta == 1 and an estimate and bounds of exactly 0. The compact sketch
// obtained from it must report the same values.
TEST_CASE("theta sketch: empty", "[theta_sketch]") {
  // freshly built update sketch
  auto updatable = update_theta_sketch::builder().build();
  REQUIRE(updatable.is_empty());
  REQUIRE_FALSE(updatable.is_estimation_mode());
  REQUIRE(updatable.get_theta() == 1.0);
  REQUIRE(updatable.get_estimate() == 0.0);
  REQUIRE(updatable.get_lower_bound(1) == 0.0);
  REQUIRE(updatable.get_upper_bound(1) == 0.0);

  // compact form of the same sketch
  compact_theta_sketch compacted = updatable.compact();
  REQUIRE(compacted.is_empty());
  REQUIRE_FALSE(compacted.is_estimation_mode());
  REQUIRE(compacted.get_theta() == 1.0);
  REQUIRE(compacted.get_estimate() == 0.0);
  REQUIRE(compacted.get_lower_bound(1) == 0.0);
  REQUIRE(compacted.get_upper_bound(1) == 0.0);
}
|
|
57
|
+
|
|
58
|
+
// A sketch built with set_p(0.001) receives a single update. The test expects
// that key not to be retained, while the sketch is no longer empty and is in
// estimation mode with a zero estimate, a zero lower bound and a positive
// upper bound. The compact sketch obtained from it must behave the same way.
TEST_CASE("theta sketch: non empty no retained keys", "[theta_sketch]") {
  // update sketch after one update
  auto updatable = update_theta_sketch::builder().set_p(0.001).build();
  updatable.update(1);
  // std::cerr << updatable.to_string();
  REQUIRE(updatable.get_num_retained() == 0);
  REQUIRE_FALSE(updatable.is_empty());
  REQUIRE(updatable.is_estimation_mode());
  REQUIRE(updatable.get_estimate() == 0.0);
  REQUIRE(updatable.get_lower_bound(1) == 0.0);
  REQUIRE(updatable.get_upper_bound(1) > 0);

  // compact form of the same sketch
  compact_theta_sketch compacted = updatable.compact();
  REQUIRE(compacted.get_num_retained() == 0);
  REQUIRE_FALSE(compacted.is_empty());
  REQUIRE(compacted.is_estimation_mode());
  REQUIRE(compacted.get_estimate() == 0.0);
  REQUIRE(compacted.get_lower_bound(1) == 0.0);
  REQUIRE(compacted.get_upper_bound(1) > 0);
}
|
|
77
|
+
|
|
78
|
+
// After exactly one update the sketch must be non-empty, not in estimation
// mode, with theta == 1 and an estimate and bounds of exactly 1. The compact
// sketch obtained from it must report the same values.
TEST_CASE("theta sketch: single item", "[theta_sketch]") {
  // update sketch holding one key
  auto updatable = update_theta_sketch::builder().build();
  updatable.update(1);
  REQUIRE_FALSE(updatable.is_empty());
  REQUIRE_FALSE(updatable.is_estimation_mode());
  REQUIRE(updatable.get_theta() == 1.0);
  REQUIRE(updatable.get_estimate() == 1.0);
  REQUIRE(updatable.get_lower_bound(1) == 1.0);
  REQUIRE(updatable.get_upper_bound(1) == 1.0);

  // compact form of the same sketch
  compact_theta_sketch compacted = updatable.compact();
  REQUIRE_FALSE(compacted.is_empty());
  REQUIRE_FALSE(compacted.is_estimation_mode());
  REQUIRE(compacted.get_theta() == 1.0);
  REQUIRE(compacted.get_estimate() == 1.0);
  REQUIRE(compacted.get_lower_bound(1) == 1.0);
  REQUIRE(compacted.get_upper_bound(1) == 1.0);
}
|
|
96
|
+
|
|
97
|
+
// 2000 distinct keys are fed into a default-built sketch. The test expects
// the sketch to stay out of estimation mode (theta == 1) and to report an
// estimate and bounds of exactly 2000. The compact sketch obtained from it
// must report the same values.
TEST_CASE("theta sketch: resize exact", "[theta_sketch]") {
  // update sketch holding the keys 0..1999
  auto updatable = update_theta_sketch::builder().build();
  for (int key = 0; key < 2000; ++key) {
    updatable.update(key);
  }
  REQUIRE_FALSE(updatable.is_empty());
  REQUIRE_FALSE(updatable.is_estimation_mode());
  REQUIRE(updatable.get_theta() == 1.0);
  REQUIRE(updatable.get_estimate() == 2000.0);
  REQUIRE(updatable.get_lower_bound(1) == 2000.0);
  REQUIRE(updatable.get_upper_bound(1) == 2000.0);

  // compact form of the same sketch
  compact_theta_sketch compacted = updatable.compact();
  REQUIRE_FALSE(compacted.is_empty());
  REQUIRE_FALSE(compacted.is_estimation_mode());
  REQUIRE(compacted.get_theta() == 1.0);
  REQUIRE(compacted.get_estimate() == 2000.0);
  REQUIRE(compacted.get_lower_bound(1) == 2000.0);
  REQUIRE(compacted.get_upper_bound(1) == 2000.0);
}
|
|
115
|
+
|
|
116
|
+
// 8000 distinct keys are fed into a sketch built with resize factor X1. The
// test expects estimation mode (theta < 1), an estimate within 1% of n and
// bounds that bracket n. It also expects trim() to leave exactly
// 1 << DEFAULT_LG_K retained entries, and the compact sketch taken afterwards
// to be ordered and to report matching estimates.
TEST_CASE("theta sketch: estimation", "[theta_sketch]") {
  auto updatable = update_theta_sketch::builder()
      .set_resize_factor(update_theta_sketch::resize_factor::X1)
      .build();
  const int n = 8000;
  for (int key = 0; key < n; ++key) {
    updatable.update(key);
  }
  // std::cerr << updatable.to_string();
  REQUIRE_FALSE(updatable.is_empty());
  REQUIRE(updatable.is_estimation_mode());
  REQUIRE(updatable.get_theta() < 1.0);
  REQUIRE(updatable.get_estimate() == Approx(static_cast<double>(n)).margin(n * 0.01));
  REQUIRE(updatable.get_lower_bound(1) < n);
  REQUIRE(updatable.get_upper_bound(1) > n);

  // at least 1 << DEFAULT_LG_K entries before trim(), exactly that many after it
  const uint32_t k = 1 << update_theta_sketch::builder::DEFAULT_LG_K;
  REQUIRE(updatable.get_num_retained() >= k);
  updatable.trim();
  REQUIRE(updatable.get_num_retained() == k);

  // compact form taken after trimming
  compact_theta_sketch compacted = updatable.compact();
  REQUIRE_FALSE(compacted.is_empty());
  REQUIRE(compacted.is_ordered());
  REQUIRE(compacted.is_estimation_mode());
  REQUIRE(compacted.get_theta() < 1.0);
  REQUIRE(compacted.get_estimate() == Approx(static_cast<double>(n)).margin(n * 0.01));
  REQUIRE(compacted.get_lower_bound(1) < n);
  REQUIRE(compacted.get_upper_bound(1) > n);
}
|
|
142
|
+
|
|
143
|
+
// Reads theta_compact_empty_from_java.sk from inputPath and expects the
// deserialized compact sketch to be empty: nothing retained, theta == 1, and
// an estimate and bounds of exactly 0.
TEST_CASE("theta sketch: deserialize compact empty from java", "[theta_sketch]") {
  std::ifstream input;
  // exceptions are enabled before open() so that a failed open throws as well
  input.exceptions(std::ios::failbit | std::ios::badbit);
  input.open(inputPath + "theta_compact_empty_from_java.sk", std::ios::binary);

  auto from_java = compact_theta_sketch::deserialize(input);
  REQUIRE(from_java.is_empty());
  REQUIRE_FALSE(from_java.is_estimation_mode());
  REQUIRE(from_java.get_num_retained() == 0);
  REQUIRE(from_java.get_theta() == 1.0);
  REQUIRE(from_java.get_estimate() == 0.0);
  REQUIRE(from_java.get_lower_bound(1) == 0.0);
  REQUIRE(from_java.get_upper_bound(1) == 0.0);
}
|
|
156
|
+
|
|
157
|
+
// Reads a compact sketch image stored in a file (named as produced by Java)
// and checks that it deserializes as an exact-mode sketch with one entry.
TEST_CASE("theta sketch: deserialize single item from java", "[theta_sketch]") {
  std::ifstream input;
  // exceptions are enabled before open() so that a failure to open the file throws
  input.exceptions(std::ios::failbit | std::ios::badbit);
  input.open(inputPath + "theta_compact_single_item_from_java.sk", std::ios::binary);

  auto loaded = compact_theta_sketch::deserialize(input);

  REQUIRE_FALSE(loaded.is_empty());
  REQUIRE_FALSE(loaded.is_estimation_mode());
  REQUIRE(loaded.get_num_retained() == 1);
  REQUIRE(loaded.get_theta() == 1.0);
  REQUIRE(loaded.get_estimate() == 1.0);
  REQUIRE(loaded.get_lower_bound(1) == 1.0);
  REQUIRE(loaded.get_upper_bound(1) == 1.0);
}
|
|
170
|
+
|
|
171
|
+
// Reads an estimation-mode compact sketch image from a file (named as produced
// by Java), checks it against recorded reference values, and then checks it
// against a sketch built locally from the keys 0..8191.
TEST_CASE("theta sketch: deserialize compact estimation from java", "[theta_sketch]") {
  std::ifstream input;
  // exceptions are enabled before open() so that a failure to open the file throws
  input.exceptions(std::ios::failbit | std::ios::badbit);
  input.open(inputPath + "theta_compact_estimation_from_java.sk", std::ios::binary);
  auto loaded = compact_theta_sketch::deserialize(input);

  // recorded reference values
  REQUIRE_FALSE(loaded.is_empty());
  REQUIRE(loaded.is_estimation_mode());
  REQUIRE(loaded.is_ordered());
  REQUIRE(loaded.get_num_retained() == 4342);
  REQUIRE(loaded.get_theta() == Approx(0.531700444213199).margin(1e-10));
  REQUIRE(loaded.get_estimate() == Approx(8166.25234614053).margin(1e-10));
  REQUIRE(loaded.get_lower_bound(2) == Approx(7996.956955317471).margin(1e-10));
  REQUIRE(loaded.get_upper_bound(2) == Approx(8339.090301078124).margin(1e-10));

  // building the sketch locally in the same way is expected to give the same sketch
  const int n = 8192;
  auto rebuilt = update_theta_sketch::builder().build();
  for (int key = 0; key < n; ++key) {
    rebuilt.update(key);
  }
  REQUIRE(loaded.get_num_retained() == rebuilt.get_num_retained());
  REQUIRE(loaded.get_theta() == Approx(rebuilt.get_theta()).margin(1e-10));
  REQUIRE(loaded.get_estimate() == Approx(rebuilt.get_estimate()).margin(1e-10));
  // lower then upper bound, for 1, 2 and 3 standard deviations
  for (uint8_t num_std_devs = 1; num_std_devs <= 3; ++num_std_devs) {
    REQUIRE(loaded.get_lower_bound(num_std_devs) == Approx(rebuilt.get_lower_bound(num_std_devs)).margin(1e-10));
    REQUIRE(loaded.get_upper_bound(num_std_devs) == Approx(rebuilt.get_upper_bound(num_std_devs)).margin(1e-10));
  }

  // both sketches are ordered, so walking them side by side must give equal entries
  compact_theta_sketch rebuilt_compact = rebuilt.compact();
  auto loaded_it = loaded.begin();
  for (auto rebuilt_it = rebuilt_compact.begin(); rebuilt_it != rebuilt_compact.end(); ++rebuilt_it) {
    REQUIRE(*loaded_it == *rebuilt_it);
    ++loaded_it;
  }
}
|
|
206
|
+
|
|
207
|
+
// Serializes the same compact sketch to a stream and to a byte buffer, checks
// that the two images are byte-for-byte equal, and checks that deserializing
// either image gives sketches that agree with each other.
TEST_CASE("theta sketch: serialize deserialize stream and bytes equivalence", "[theta_sketch]") {
  const int n = 8192;
  auto source = update_theta_sketch::builder().build();
  for (int key = 0; key < n; ++key) {
    source.update(key);
  }

  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
  source.compact().serialize(stream);
  auto bytes = source.compact().serialize();

  // same length and same content
  REQUIRE(bytes.size() == static_cast<size_t>(stream.tellp()));
  const char* raw = reinterpret_cast<const char*>(bytes.data());
  for (size_t pos = 0; pos < bytes.size(); ++pos) {
    REQUIRE(raw[pos] == static_cast<char>(stream.get()));
  }

  stream.seekg(0); // back to the start before deserializing from the stream
  compact_theta_sketch from_stream = compact_theta_sketch::deserialize(stream);
  compact_theta_sketch from_bytes = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
  // the stream read position must have advanced by exactly the image size
  REQUIRE(bytes.size() == static_cast<size_t>(stream.tellg()));

  REQUIRE(from_bytes.is_empty() == from_stream.is_empty());
  REQUIRE(from_bytes.is_ordered() == from_stream.is_ordered());
  REQUIRE(from_bytes.get_num_retained() == from_stream.get_num_retained());
  REQUIRE(from_bytes.get_theta() == from_stream.get_theta());
  REQUIRE(from_bytes.get_estimate() == from_stream.get_estimate());
  REQUIRE(from_bytes.get_lower_bound(1) == from_stream.get_lower_bound(1));
  REQUIRE(from_bytes.get_upper_bound(1) == from_stream.get_upper_bound(1));

  // both sketches are ordered, so their entries must come out in the same sequence
  auto stream_it = from_stream.begin();
  for (auto entry: from_bytes) {
    REQUIRE(*stream_it == entry);
    ++stream_it;
  }
}
|
|
238
|
+
|
|
239
|
+
// Deserializing from a buffer whose declared size is smaller than the
// serialized image must throw std::out_of_range.
TEST_CASE("theta sketch: deserialize compact single item buffer overrun", "[theta_sketch]") {
  auto source = update_theta_sketch::builder().build();
  source.update(1);
  auto bytes = source.compact().serialize();

  // declared size of only 7 bytes
  REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 7), std::out_of_range);
  // declared size one byte short of the full image
  REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
}
|
|
246
|
+
|
|
247
|
+
} /* namespace datasketches */
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <iostream>
|
|
21
|
+
|
|
22
|
+
#include <catch.hpp>
|
|
23
|
+
#include <tuple_union.hpp>
|
|
24
|
+
|
|
25
|
+
#include <theta_union_experimental.hpp>
|
|
26
|
+
|
|
27
|
+
namespace datasketches {
|
|
28
|
+
|
|
29
|
+
// Unions two small exact-mode sketches that share one key ({1, 2} and {1, 3})
// and verifies the result. The original test only ran the union and asserted
// nothing, so it could not fail on a wrong result; its name was also misspelled.
TEST_CASE("theta_union_experimental") {
  auto update_sketch1 = update_theta_sketch_experimental<>::builder().build();
  update_sketch1.update(1);
  update_sketch1.update(2);

  auto update_sketch2 = update_theta_sketch_experimental<>::builder().build();
  update_sketch2.update(1);
  update_sketch2.update(3);

  auto u = theta_union_experimental<>::builder().build();
  u.update(update_sketch1);
  u.update(update_sketch2);
  auto r = u.get_result();

  // three distinct keys were presented in total, far too few to leave exact mode
  REQUIRE_FALSE(r.is_empty());
  REQUIRE_FALSE(r.is_estimation_mode());
  REQUIRE(r.get_num_retained() == 3);
  REQUIRE(r.get_estimate() == 3.0);
}
|
|
43
|
+
|
|
44
|
+
} /* namespace datasketches */
|
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <iostream>
|
|
21
|
+
|
|
22
|
+
#include <catch.hpp>
|
|
23
|
+
#include <tuple_a_not_b.hpp>
|
|
24
|
+
#include <theta_sketch_experimental.hpp>
|
|
25
|
+
|
|
26
|
+
namespace datasketches {
|
|
27
|
+
|
|
28
|
+
// A-not-B of two empty sketches must be an empty, exact-mode result.
TEST_CASE("tuple a-not-b: empty", "[tuple_a_not_b]") {
  auto sketch_a = update_tuple_sketch<float>::builder().build();
  auto sketch_b = update_tuple_sketch<float>::builder().build();

  tuple_a_not_b<float> difference;
  auto outcome = difference.compute(sketch_a, sketch_b);

  REQUIRE(outcome.get_num_retained() == 0);
  REQUIRE(outcome.is_empty());
  REQUIRE_FALSE(outcome.is_estimation_mode());
  REQUIRE(outcome.get_estimate() == 0.0);
}
|
|
38
|
+
|
|
39
|
+
// A holds one key; B is built with p = 0.001. The operation is checked twice:
// while B is still empty, and after B was updated with a key that the test
// expects not to be retained.
TEST_CASE("tuple a-not-b: non empty no retained keys", "[tuple_a_not_b]") {
  auto sketch_a = update_tuple_sketch<float>::builder().build();
  sketch_a.update(1, 1);
  auto sketch_b = update_tuple_sketch<float>::builder().set_p(0.001).build();
  tuple_a_not_b<float> difference;

  // case 1: nothing has been presented to B yet
  auto outcome = difference.compute(sketch_a, sketch_b);
  REQUIRE_FALSE(outcome.is_empty());
  REQUIRE_FALSE(outcome.is_estimation_mode());
  REQUIRE(outcome.get_num_retained() == 1);
  REQUIRE(outcome.get_theta() == Approx(1).margin(1e-10));
  REQUIRE(outcome.get_estimate() == 1.0);

  // case 2: B has seen a key but retains no entries
  sketch_b.update(1, 1);
  REQUIRE(sketch_b.get_num_retained() == 0);

  outcome = difference.compute(sketch_a, sketch_b);
  REQUIRE_FALSE(outcome.is_empty());
  REQUIRE(outcome.is_estimation_mode());
  REQUIRE(outcome.get_num_retained() == 0);
  REQUIRE(outcome.get_theta() == Approx(0.001).margin(1e-10));
  REQUIRE(outcome.get_estimate() == 0.0);
}
|
|
64
|
+
|
|
65
|
+
// A holds keys 0..999 and B holds keys 500..1499, so exactly 500 keys of A are
// absent from B. The operation is checked for several combinations of
// update/compact inputs and of the ordered flag.
TEST_CASE("tuple a-not-b: exact mode half overlap", "[tuple_a_not_b]") {
  auto sketch_a = update_tuple_sketch<float>::builder().build();
  for (int key = 0; key < 1000; ++key) {
    sketch_a.update(key, 1);
  }

  auto sketch_b = update_tuple_sketch<float>::builder().build();
  for (int key = 500; key < 1500; ++key) {
    sketch_b.update(key, 1);
  }

  tuple_a_not_b<float> difference;

  // update sketches in, default flag: ordered result expected
  auto outcome = difference.compute(sketch_a, sketch_b);
  REQUIRE_FALSE(outcome.is_empty());
  REQUIRE_FALSE(outcome.is_estimation_mode());
  REQUIRE(outcome.is_ordered());
  REQUIRE(outcome.get_estimate() == 500.0);

  // update sketches in, flag set to false: unordered result expected
  outcome = difference.compute(sketch_a, sketch_b, false);
  REQUIRE_FALSE(outcome.is_empty());
  REQUIRE_FALSE(outcome.is_estimation_mode());
  REQUIRE_FALSE(outcome.is_ordered());
  REQUIRE(outcome.get_estimate() == 500.0);

  // compact sketches in
  outcome = difference.compute(sketch_a.compact(), sketch_b.compact());
  REQUIRE_FALSE(outcome.is_empty());
  REQUIRE_FALSE(outcome.is_estimation_mode());
  REQUIRE(outcome.is_ordered());
  REQUIRE(outcome.get_estimate() == 500.0);

  // compact A with the flag set to false: the result is still expected to be ordered
  outcome = difference.compute(sketch_a.compact(), sketch_b, false);
  REQUIRE_FALSE(outcome.is_empty());
  REQUIRE_FALSE(outcome.is_estimation_mode());
  REQUIRE(outcome.is_ordered());
  REQUIRE(outcome.get_estimate() == 500.0);
}
|
|
104
|
+
|
|
105
|
+
// needed until promotion of experimental to replace existing theta sketch
|
|
106
|
+
using update_theta_sketch = update_theta_sketch_experimental<>;
|
|
107
|
+
|
|
108
|
+
// Same half-overlap scenario as the pure tuple test (A: 0..999, B: 500..1499),
// but B is a theta sketch that is wrapped into a compact tuple sketch with a
// summary value of 1 before it is passed to the operation.
TEST_CASE("mixed a-not-b: exact mode half overlap", "[tuple_a_not_b]") {
  auto tuple_a = update_tuple_sketch<float>::builder().build();
  for (int key = 0; key < 1000; ++key) {
    tuple_a.update(key, 1);
  }

  auto theta_b = update_theta_sketch::builder().build();
  for (int key = 500; key < 1500; ++key) {
    theta_b.update(key);
  }

  tuple_a_not_b<float> difference;

  // update A and unordered wrapper of B, default flag: ordered result expected
  auto outcome = difference.compute(tuple_a, compact_tuple_sketch<float>(theta_b, 1, false));
  REQUIRE_FALSE(outcome.is_empty());
  REQUIRE_FALSE(outcome.is_estimation_mode());
  REQUIRE(outcome.is_ordered());
  REQUIRE(outcome.get_estimate() == 500.0);

  // update A and unordered wrapper of B, flag set to false: unordered result expected
  outcome = difference.compute(tuple_a, compact_tuple_sketch<float>(theta_b, 1, false), false);
  REQUIRE_FALSE(outcome.is_empty());
  REQUIRE_FALSE(outcome.is_estimation_mode());
  REQUIRE_FALSE(outcome.is_ordered());
  REQUIRE(outcome.get_estimate() == 500.0);

  // compact A and wrapper of compact B
  outcome = difference.compute(tuple_a.compact(), compact_tuple_sketch<float>(theta_b.compact(), 1));
  REQUIRE_FALSE(outcome.is_empty());
  REQUIRE_FALSE(outcome.is_estimation_mode());
  REQUIRE(outcome.is_ordered());
  REQUIRE(outcome.get_estimate() == 500.0);

  // compact A with the flag set to false: the result is still expected to be ordered
  outcome = difference.compute(tuple_a.compact(), compact_tuple_sketch<float>(theta_b, 1, false), false);
  REQUIRE_FALSE(outcome.is_empty());
  REQUIRE_FALSE(outcome.is_estimation_mode());
  REQUIRE(outcome.is_ordered());
  REQUIRE(outcome.get_estimate() == 500.0);
}
|
|
147
|
+
|
|
148
|
+
// A holds keys 0..999 and B holds keys 1000..1999, so no key of A is removed
// and the exact result must be 1000.
TEST_CASE("tuple a-not-b: exact mode disjoint", "[tuple_a_not_b]") {
  auto sketch_a = update_tuple_sketch<float>::builder().build();
  for (int key = 0; key < 1000; ++key) {
    sketch_a.update(key, 1);
  }

  auto sketch_b = update_tuple_sketch<float>::builder().build();
  for (int key = 1000; key < 2000; ++key) {
    sketch_b.update(key, 1);
  }

  tuple_a_not_b<float> difference;

  // update sketches in
  auto outcome = difference.compute(sketch_a, sketch_b);
  REQUIRE_FALSE(outcome.is_empty());
  REQUIRE_FALSE(outcome.is_estimation_mode());
  REQUIRE(outcome.get_estimate() == 1000.0);

  // compact sketches in
  outcome = difference.compute(sketch_a.compact(), sketch_b.compact());
  REQUIRE_FALSE(outcome.is_empty());
  REQUIRE_FALSE(outcome.is_estimation_mode());
  REQUIRE(outcome.get_estimate() == 1000.0);
}
|
|
170
|
+
|
|
171
|
+
// Subtracting an exact-mode sketch (keys 0..999) from itself must give an
// empty, exact-mode result.
TEST_CASE("tuple a-not-b: exact mode full overlap", "[tuple_a_not_b]") {
  auto both_sides = update_tuple_sketch<float>::builder().build();
  for (int key = 0; key < 1000; ++key) {
    both_sides.update(key, 1);
  }

  tuple_a_not_b<float> difference;

  // update sketch on both sides
  auto outcome = difference.compute(both_sides, both_sides);
  REQUIRE(outcome.is_empty());
  REQUIRE_FALSE(outcome.is_estimation_mode());
  REQUIRE(outcome.get_estimate() == 0.0);

  // compact sketch on both sides
  outcome = difference.compute(both_sides.compact(), both_sides.compact());
  REQUIRE(outcome.is_empty());
  REQUIRE_FALSE(outcome.is_estimation_mode());
  REQUIRE(outcome.get_estimate() == 0.0);
}
|
|
190
|
+
|
|
191
|
+
// A holds keys 0..9999 and B holds keys 5000..14999. The result is expected to
// be in estimation mode with an estimate within 2% of 5000.
TEST_CASE("tuple a-not-b: estimation mode half overlap", "[tuple_a_not_b]") {
  auto sketch_a = update_tuple_sketch<float>::builder().build();
  for (int key = 0; key < 10000; ++key) {
    sketch_a.update(key, 1);
  }

  auto sketch_b = update_tuple_sketch<float>::builder().build();
  for (int key = 5000; key < 15000; ++key) {
    sketch_b.update(key, 1);
  }

  tuple_a_not_b<float> difference;

  // update sketches in
  auto outcome = difference.compute(sketch_a, sketch_b);
  REQUIRE_FALSE(outcome.is_empty());
  REQUIRE(outcome.is_estimation_mode());
  REQUIRE(outcome.get_estimate() == Approx(5000).margin(5000 * 0.02));

  // compact sketches in
  outcome = difference.compute(sketch_a.compact(), sketch_b.compact());
  REQUIRE_FALSE(outcome.is_empty());
  REQUIRE(outcome.is_estimation_mode());
  REQUIRE(outcome.get_estimate() == Approx(5000).margin(5000 * 0.02));
}
|
|
214
|
+
|
|
215
|
+
// A holds keys 0..9999 and B holds keys 10000..19999. The result is expected
// to be in estimation mode with an estimate within 2% of 10000.
TEST_CASE("tuple a-not-b: estimation mode disjoint", "[tuple_a_not_b]") {
  auto sketch_a = update_tuple_sketch<float>::builder().build();
  for (int key = 0; key < 10000; ++key) {
    sketch_a.update(key, 1);
  }

  auto sketch_b = update_tuple_sketch<float>::builder().build();
  for (int key = 10000; key < 20000; ++key) {
    sketch_b.update(key, 1);
  }

  tuple_a_not_b<float> difference;

  // update sketches in
  auto outcome = difference.compute(sketch_a, sketch_b);
  REQUIRE_FALSE(outcome.is_empty());
  REQUIRE(outcome.is_estimation_mode());
  REQUIRE(outcome.get_estimate() == Approx(10000).margin(10000 * 0.02));

  // compact sketches in
  outcome = difference.compute(sketch_a.compact(), sketch_b.compact());
  REQUIRE_FALSE(outcome.is_empty());
  REQUIRE(outcome.is_estimation_mode());
  REQUIRE(outcome.get_estimate() == Approx(10000).margin(10000 * 0.02));
}
|
|
237
|
+
|
|
238
|
+
// Subtracting an estimation-mode sketch (keys 0..9999) from itself must give a
// non-empty, estimation-mode result whose estimate is exactly zero.
TEST_CASE("tuple a-not-b: estimation mode full overlap", "[tuple_a_not_b]") {
  auto both_sides = update_tuple_sketch<float>::builder().build();
  for (int key = 0; key < 10000; ++key) {
    both_sides.update(key, 1);
  }

  tuple_a_not_b<float> difference;

  // update sketch on both sides
  auto outcome = difference.compute(both_sides, both_sides);
  REQUIRE_FALSE(outcome.is_empty());
  REQUIRE(outcome.is_estimation_mode());
  REQUIRE(outcome.get_estimate() == 0.0);

  // compact sketch on both sides
  outcome = difference.compute(both_sides.compact(), both_sides.compact());
  REQUIRE_FALSE(outcome.is_empty());
  REQUIRE(outcome.is_estimation_mode());
  REQUIRE(outcome.get_estimate() == 0.0);
}
|
|
257
|
+
|
|
258
|
+
// An operation constructed with seed 123 must reject a non-empty sketch that
// was built with the default seed.
TEST_CASE("tuple a-not-b: seed mismatch", "[tuple_a_not_b]") {
  auto default_seed_sketch = update_tuple_sketch<float>::builder().build();
  // the sketch is made non-empty on purpose so that it should not be ignored
  default_seed_sketch.update(1, 1);

  tuple_a_not_b<float> difference(123);
  REQUIRE_THROWS_AS(difference.compute(default_seed_sketch, default_seed_sketch), std::invalid_argument);
}
|
|
264
|
+
|
|
265
|
+
// Scenario labelled issue #152: A holds keys 0..9999 and B holds the larger
// range 5000..29999. The result is expected to be in estimation mode with an
// estimate within 3% of 5000.
TEST_CASE("tuple a-not-b: issue #152", "[tuple_a_not_b]") {
  auto sketch_a = update_tuple_sketch<float>::builder().build();
  for (int key = 0; key < 10000; ++key) {
    sketch_a.update(key, 1);
  }

  auto sketch_b = update_tuple_sketch<float>::builder().build();
  for (int key = 5000; key < 30000; ++key) {
    sketch_b.update(key, 1);
  }

  tuple_a_not_b<float> difference;

  // update sketches in
  auto outcome = difference.compute(sketch_a, sketch_b);
  REQUIRE_FALSE(outcome.is_empty());
  REQUIRE(outcome.is_estimation_mode());
  REQUIRE(outcome.get_estimate() == Approx(5000).margin(5000 * 0.03));

  // compact sketches in
  outcome = difference.compute(sketch_a.compact(), sketch_b.compact());
  REQUIRE_FALSE(outcome.is_empty());
  REQUIRE(outcome.is_estimation_mode());
  REQUIRE(outcome.get_estimate() == Approx(5000).margin(5000 * 0.03));
}
|
|
288
|
+
|
|
289
|
+
} /* namespace datasketches */
|