datasketches 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
18
|
+
add_executable(sampling_test)
|
|
19
|
+
|
|
20
|
+
target_link_libraries(sampling_test sampling common_test)
|
|
21
|
+
|
|
22
|
+
set_target_properties(sampling_test PROPERTIES
|
|
23
|
+
CXX_STANDARD 11
|
|
24
|
+
CXX_STANDARD_REQUIRED YES
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" SAMPLING_TEST_BINARY_PATH)
|
|
28
|
+
string(APPEND SAMPLING_TEST_BINARY_PATH "/")
|
|
29
|
+
target_compile_definitions(sampling_test
|
|
30
|
+
PRIVATE
|
|
31
|
+
TEST_BINARY_INPUT_PATH="${SAMPLING_TEST_BINARY_PATH}"
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
add_test(
|
|
35
|
+
NAME sampling_test
|
|
36
|
+
COMMAND sampling_test
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
target_sources(sampling_test
|
|
40
|
+
PRIVATE
|
|
41
|
+
var_opt_sketch_test.cpp
|
|
42
|
+
var_opt_union_test.cpp
|
|
43
|
+
)
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
Code snippets used to generate the binary images from Java.
|
|
21
|
+
Heavy items have negative values to allow a simple predicate to filter
|
|
22
|
+
heavy vs light sketch entries.
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
varopt_sketch_long_sampling.bin:
|
|
26
|
+
final VarOptItemsSketch<String> sk = VarOptItemsSketch.newInstance(1024);
|
|
27
|
+
for (int i = 1; i <= 200; ++i) {
|
|
28
|
+
sk.update(Integer.toString(i), 1000.0 / i);
|
|
29
|
+
}
|
|
30
|
+
byte[] bytes = sk.toByteArray(new ArrayOfStringsSerDe());
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
varopt_sketch_string_exact.bin:
|
|
34
|
+
final VarOptItemsSketch<Long> sk = VarOptItemsSketch.newInstance(1024);
|
|
35
|
+
for (long i = 0; i < 2000; ++i) {
|
|
36
|
+
sk.update(i, 1.0);
|
|
37
|
+
}
|
|
38
|
+
sk.update(-1L, 100000.0);
|
|
39
|
+
sk.update(-2L, 110000.0);
|
|
40
|
+
sk.update(-3L, 120000.0);
|
|
41
|
+
byte[] bytes = sk.toByteArray(new ArrayOfLongsSerDe());
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
varopt_union_double_sampling.bin:
|
|
45
|
+
// parallels small sampling sketch test
|
|
46
|
+
final int kSmall = 16;
|
|
47
|
+
final int n1 = 32;
|
|
48
|
+
final int n2 = 64;
|
|
49
|
+
final int kMax = 128;
|
|
50
|
+
|
|
51
|
+
// small k sketch, but sampling
|
|
52
|
+
VarOptItemsSketch<Double> sketch = VarOptItemsSketch.newInstance(kSmall);
|
|
53
|
+
for (int i = 0; i < n1; ++i) {
|
|
54
|
+
sketch.update(1.0 * i, 1.0);
|
|
55
|
+
}
|
|
56
|
+
sketch.update(-1.0, n1 * n1); // add a heavy item
|
|
57
|
+
|
|
58
|
+
final VarOptItemsUnion<Double> union = VarOptItemsUnion.newInstance(kMax);
|
|
59
|
+
union.update(sketch);
|
|
60
|
+
|
|
61
|
+
// another one, but different n to get a different per-item weight
|
|
62
|
+
sketch = VarOptItemsSketch.newInstance(kSmall);
|
|
63
|
+
for (int i = 0; i < n2; ++i) {
|
|
64
|
+
sketch.update(1.0 * i, 1.0);
|
|
65
|
+
}
|
|
66
|
+
union.update(sketch);
|
|
67
|
+
byte[] bytes = union.toByteArray(new ArrayOfDoublesSerDe());
|
|
@@ -0,0 +1,509 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <var_opt_sketch.hpp>
|
|
21
|
+
|
|
22
|
+
#include <catch.hpp>
|
|
23
|
+
|
|
24
|
+
#include <vector>
|
|
25
|
+
#include <string>
|
|
26
|
+
#include <sstream>
|
|
27
|
+
#include <fstream>
|
|
28
|
+
#include <cmath>
|
|
29
|
+
#include <random>
|
|
30
|
+
|
|
31
|
+
#ifdef TEST_BINARY_INPUT_PATH
|
|
32
|
+
static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
|
|
33
|
+
#else
|
|
34
|
+
static std::string testBinaryInputPath = "test/";
|
|
35
|
+
#endif
|
|
36
|
+
|
|
37
|
+
namespace datasketches {
|
|
38
|
+
|
|
39
|
+
static constexpr double EPS = 1e-13;
|
|
40
|
+
|
|
41
|
+
static var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
|
|
42
|
+
var_opt_sketch<int> sk(k);
|
|
43
|
+
for (uint64_t i = 0; i < n; ++i) {
|
|
44
|
+
sk.update(i, 1.0);
|
|
45
|
+
}
|
|
46
|
+
return sk;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
template<typename T, typename S, typename A>
|
|
50
|
+
static void check_if_equal(var_opt_sketch<T,S,A>& sk1, var_opt_sketch<T,S,A>& sk2) {
|
|
51
|
+
REQUIRE(sk1.get_k() == sk2.get_k());
|
|
52
|
+
REQUIRE(sk1.get_n() == sk2.get_n());
|
|
53
|
+
REQUIRE(sk1.get_num_samples() == sk2.get_num_samples());
|
|
54
|
+
|
|
55
|
+
auto it1 = sk1.begin();
|
|
56
|
+
auto it2 = sk2.begin();
|
|
57
|
+
size_t i = 0;
|
|
58
|
+
|
|
59
|
+
while ((it1 != sk1.end()) && (it2 != sk2.end())) {
|
|
60
|
+
const std::pair<const T&, const double> p1 = *it1;
|
|
61
|
+
const std::pair<const T&, const double> p2 = *it2;
|
|
62
|
+
REQUIRE(p1.first == p2.first); // data values
|
|
63
|
+
REQUIRE(p1.second == p2.second); // weights
|
|
64
|
+
++i;
|
|
65
|
+
++it1;
|
|
66
|
+
++it2;
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
REQUIRE((it1 == sk1.end() && it2 == sk2.end())); // iterators must end at the same time
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
// The constructor must reject both k == 0 and k == 2^31; the latter is the
// bit pattern that would be a negative k if interpreted as a signed 32-bit int.
TEST_CASE("varopt sketch: invalid k", "[var_opt_sketch]") {
  REQUIRE_THROWS_AS(var_opt_sketch<int>(0), std::invalid_argument);
  // Unsigned literal: shifting a signed 1 into the sign bit is not portable
  // before C++20, while 1U << 31 is well defined and yields the same value.
  REQUIRE_THROWS_AS(var_opt_sketch<int>(1U << 31), std::invalid_argument); // aka k < 0
}
|
|
76
|
+
|
|
77
|
+
TEST_CASE("varopt sketch: bad serialization version", "[var_opt_sketch]") {
|
|
78
|
+
var_opt_sketch<int> sk = create_unweighted_sketch(16, 16);
|
|
79
|
+
std::vector<uint8_t> bytes = sk.serialize();
|
|
80
|
+
bytes[1] = 0; // corrupt the serialization version byte
|
|
81
|
+
|
|
82
|
+
REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
|
|
83
|
+
|
|
84
|
+
// create a stringstream to check the same
|
|
85
|
+
std::stringstream ss;
|
|
86
|
+
std::string str(bytes.begin(), bytes.end());
|
|
87
|
+
ss.str(str);
|
|
88
|
+
REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(ss), std::invalid_argument);
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
TEST_CASE("varopt sketch: bad family", "[var_opt_sketch]") {
|
|
92
|
+
var_opt_sketch<int> sk = create_unweighted_sketch(16, 16);
|
|
93
|
+
std::vector<uint8_t> bytes = sk.serialize();
|
|
94
|
+
bytes[2] = 0; // corrupt the family byte
|
|
95
|
+
|
|
96
|
+
REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
|
|
97
|
+
|
|
98
|
+
// create a stringstream to check the same
|
|
99
|
+
std::stringstream ss;
|
|
100
|
+
std::string str(bytes.begin(), bytes.end());
|
|
101
|
+
ss.str(str);
|
|
102
|
+
REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(ss), std::invalid_argument);
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
TEST_CASE("varopt sketch: bad prelongs", "[var_opt_sketch]") {
|
|
106
|
+
// The number of preamble longs shares bits with resize_factor, but the latter
|
|
107
|
+
// has no invalid values as it gets 2 bits for 4 enum values.
|
|
108
|
+
var_opt_sketch<int> sk = create_unweighted_sketch(32, 33);
|
|
109
|
+
std::vector<uint8_t> bytes = sk.serialize();
|
|
110
|
+
|
|
111
|
+
bytes[0] = 0; // corrupt the preamble longs byte to be too small
|
|
112
|
+
REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
|
|
113
|
+
|
|
114
|
+
bytes[0] = 2; // corrupt the preamble longs byte to 2
|
|
115
|
+
REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
|
|
116
|
+
|
|
117
|
+
bytes[0] = 5; // corrupt the preamble longs byte to be too large
|
|
118
|
+
REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
TEST_CASE("varopt sketch: malformed preamble", "[var_opt_sketch]") {
|
|
122
|
+
uint32_t k = 50;
|
|
123
|
+
var_opt_sketch<int> sk = create_unweighted_sketch(k, k);
|
|
124
|
+
const std::vector<uint8_t> src_bytes = sk.serialize();
|
|
125
|
+
|
|
126
|
+
// we'll re-use the same bytes several times so we'll use copies
|
|
127
|
+
std::vector<uint8_t> bytes(src_bytes);
|
|
128
|
+
|
|
129
|
+
// no items in R, but preamble longs indicates full
|
|
130
|
+
bytes[0] = 4; // PREAMBLE_LONGS_FULL
|
|
131
|
+
REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
|
|
132
|
+
|
|
133
|
+
// k = 0
|
|
134
|
+
bytes = src_bytes;
|
|
135
|
+
*reinterpret_cast<int32_t*>(&bytes[4]) = 0;
|
|
136
|
+
REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
|
|
137
|
+
|
|
138
|
+
// negative H region count in Java (signed ints)
|
|
139
|
+
// throws due to H count != n in exact mode
|
|
140
|
+
bytes = src_bytes;
|
|
141
|
+
*reinterpret_cast<int32_t*>(&bytes[16]) = -1;
|
|
142
|
+
REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
|
|
143
|
+
|
|
144
|
+
// negative R region count in Java (signed ints)
|
|
145
|
+
// throws due to non-zero R in sampling mode
|
|
146
|
+
bytes = src_bytes;
|
|
147
|
+
*reinterpret_cast<int32_t*>(&bytes[20]) = -128;
|
|
148
|
+
REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
TEST_CASE("varopt sketch: empty sketch", "[var_opt_sketch]") {
|
|
152
|
+
var_opt_sketch<std::string> sk(5);
|
|
153
|
+
REQUIRE(sk.get_n() == 0);
|
|
154
|
+
REQUIRE(sk.get_num_samples() == 0);
|
|
155
|
+
|
|
156
|
+
std::vector<uint8_t> bytes = sk.serialize();
|
|
157
|
+
REQUIRE(bytes.size() == (1 << 3)); // num bytes in PREAMBLE_LONGS_EMPTY
|
|
158
|
+
|
|
159
|
+
var_opt_sketch<std::string> loaded_sk = var_opt_sketch<std::string>::deserialize(bytes.data(), bytes.size());
|
|
160
|
+
REQUIRE(loaded_sk.get_n() == 0);
|
|
161
|
+
REQUIRE(loaded_sk.get_num_samples() == 0);
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
TEST_CASE("varopt sketch: non-empty degenerate sketch", "[var_opt_sketch]") {
|
|
165
|
+
// Make an empty serialized sketch, then extend it to a
|
|
166
|
+
// PREAMBLE_LONGS_WARMUP-sized byte array, with no items.
|
|
167
|
+
// Then clear the empty flag so it will try to load the rest.
|
|
168
|
+
var_opt_sketch<std::string> sk(12, resize_factor::X2);
|
|
169
|
+
std::vector<uint8_t> bytes = sk.serialize();
|
|
170
|
+
while (bytes.size() < 24) { // PREAMBLE_LONGS_WARMUP * 8
|
|
171
|
+
bytes.push_back((uint8_t) 0);
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
// ensure non-empty -- H and R region sizes already set to 0
|
|
175
|
+
bytes[3] = 0; // set flags bit to not-empty (other bits should already be 0)
|
|
176
|
+
|
|
177
|
+
REQUIRE_THROWS_AS(var_opt_sketch<std::string>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
TEST_CASE("varopt sketch: invalid weight", "[var_opt_sketch]") {
|
|
181
|
+
var_opt_sketch<std::string> sk(100, resize_factor::X2);
|
|
182
|
+
REQUIRE_THROWS_AS(sk.update("invalid_weight", -1.0), std::invalid_argument);
|
|
183
|
+
|
|
184
|
+
// should not throw but sketch should still be empty
|
|
185
|
+
sk.update("zero weight", 0.0);
|
|
186
|
+
REQUIRE(sk.is_empty());
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
// Overwrites the first serialized weight with a negative value and checks that
// deserialization rejects the image, both from a byte array and from a stream.
TEST_CASE("varopt sketch: corrupt serialized weight", "[var_opt_sketch]") {
  var_opt_sketch<int> sk = create_unweighted_sketch(100, 20);
  auto bytes = sk.serialize();

  // weights are in the first double after the preamble;
  // the low 6 bits of byte 0 are read as the preamble length in 8-byte longs
  size_t preamble_bytes = (bytes[0] & 0x3f) << 3;
  *reinterpret_cast<double*>(&bytes[preamble_bytes]) = -1.5;

  REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(bytes.data(), bytes.size()), std::invalid_argument);

  // Insert the corrupted bytes into the stream so the stream overload of
  // deserialize() sees the same image. Extraction (>>) here would read from
  // an empty stream and leave it without the serialized data.
  std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
  for (const auto& b : bytes) { ss << b; }
  REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(ss), std::invalid_argument);
}
|
|
203
|
+
|
|
204
|
+
// The weights reported by the sketch must sum to the total weight that was
// fed in, within EPS, even when the inputs span many orders of magnitude.
TEST_CASE("varopt sketch: cumulative weight", "[var_opt_sketch]") {
  const uint32_t k = 256;
  const uint64_t n = 10 * k;
  var_opt_sketch<int> sk(k);

  std::random_device seed_source; // possibly unsafe in MinGW with GCC < 9.2
  std::mt19937_64 engine(seed_source());
  std::normal_distribution<double> std_normal(0.0, 1.0);

  // Feed n items with weights above and below 1.0 using w ~ exp(5*N(0,1)),
  // which covers about 10 orders of magnitude.
  double fed_weight = 0.0;
  for (size_t i = 0; i < n; ++i) {
    const double w = std::exp(5 * std_normal(engine));
    fed_weight += w;
    sk.update(i, w);
  }

  // Sum the weights of the retained samples; each entry is an (item, weight) pair.
  double retained_weight = 0.0;
  for (auto it = sk.begin(); it != sk.end(); ++it) {
    retained_weight += (*it).second;
  }

  const double weight_ratio = retained_weight / fed_weight;
  REQUIRE(std::abs(weight_ratio - 1.0) == Approx(0).margin(EPS));
}
|
|
230
|
+
|
|
231
|
+
// Round-trips an under-full sketch (n < k) through both the byte-array and
// the stream serialization paths, then checks that truncated input throws.
TEST_CASE("varopt sketch: under-full sketch serialization", "[var_opt_sketch]") {
  var_opt_sketch<int> original = create_unweighted_sketch(100, 10); // need n < k

  // byte-array round trip
  auto image = original.serialize();
  var_opt_sketch<int> from_bytes = var_opt_sketch<int>::deserialize(image.data(), image.size());
  check_if_equal(original, from_bytes);

  // stream round trip
  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
  original.serialize(stream);
  var_opt_sketch<int> from_stream = var_opt_sketch<int>::deserialize(stream);
  check_if_equal(original, from_stream);

  // ensure we unroll properly: an image cut short by one byte must throw
  const size_t truncated_size = image.size() - 1;
  REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(image.data(), truncated_size), std::out_of_range);
  const std::string truncated(reinterpret_cast<const char*>(image.data()), truncated_size);
  stream.str(truncated);
  REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(stream), std::runtime_error);
}
|
|
249
|
+
|
|
250
|
+
// Round-trips a sketch holding exactly k items (n == k) through both
// serialization paths, then checks that truncated input throws.
TEST_CASE("varopt sketch: end-of-warmup sketch serialization", "[var_opt_sketch]") {
  var_opt_sketch<int> original = create_unweighted_sketch(2843, 2843); // need n == k
  auto image = original.serialize();

  // the low 6 bits of byte 0 must still report only 3 preamble longs
  const int preamble_longs = image.data()[0] & 0x3f;
  REQUIRE(preamble_longs == 3); // PREAMBLE_LONGS_WARMUP

  // byte-array round trip
  var_opt_sketch<int> from_bytes = var_opt_sketch<int>::deserialize(image.data(), image.size());
  check_if_equal(original, from_bytes);

  // stream round trip
  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
  original.serialize(stream);
  var_opt_sketch<int> from_stream = var_opt_sketch<int>::deserialize(stream);
  check_if_equal(original, from_stream);

  // ensure we unroll properly: truncated images must throw
  REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(image.data(), image.size() - 1000), std::out_of_range);
  const std::string truncated(reinterpret_cast<const char*>(image.data()), image.size() - 100);
  stream.str(truncated);
  REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(stream), std::runtime_error);
}
|
|
271
|
+
|
|
272
|
+
// Pushes a sketch past k items with two heavy updates, verifies the heavy
// items lead the iteration order, and round-trips the sketch through both
// serialization paths. Truncated input must throw.
TEST_CASE("varopt sketch: full sketch serialization", "[var_opt_sketch]") {
  var_opt_sketch<int> original = create_unweighted_sketch(32, 32);
  original.update(100, 100.0);
  original.update(101, 101.0);

  // first 2 entries should be heavy and in heap order (smallest at root)
  auto cursor = original.begin();
  const std::pair<const int, const double> lighter_heavy = *cursor;
  ++cursor;
  const std::pair<const int, const double> heavier_heavy = *cursor;
  REQUIRE(lighter_heavy.second == Approx(100.0).margin(EPS));
  REQUIRE(heavier_heavy.second == Approx(101.0).margin(EPS));
  REQUIRE(lighter_heavy.first == 100);
  REQUIRE(heavier_heavy.first == 101);

  // the low 6 bits of byte 0 must now report 4 preamble longs
  // (one more than the 3-long warmup preamble)
  auto image = original.serialize();
  REQUIRE((image.data()[0] & 0x3f) == 4);

  // byte-array round trip
  var_opt_sketch<int> from_bytes = var_opt_sketch<int>::deserialize(image.data(), image.size());
  check_if_equal(original, from_bytes);

  // stream round trip
  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
  original.serialize(stream);
  var_opt_sketch<int> from_stream = var_opt_sketch<int>::deserialize(stream);
  check_if_equal(original, from_stream);

  // ensure we unroll properly: an image cut short by 100 bytes must throw
  const size_t truncated_size = image.size() - 100;
  REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(image.data(), truncated_size), std::out_of_range);
  const std::string truncated(reinterpret_cast<const char*>(image.data()), truncated_size);
  stream.str(truncated);
  REQUIRE_THROWS_AS(var_opt_sketch<int>::deserialize(stream), std::runtime_error);
}
|
|
305
|
+
|
|
306
|
+
// Round-trips a sketch of variable-length strings (five unit-weight items
// plus one heavy item) through both serialization paths. Truncated input
// must throw.
TEST_CASE("varopt sketch: string serialization", "[var_opt_sketch]") {
  var_opt_sketch<std::string> original(5);

  // five unit-weight items of increasing length, then one heavy item
  const char* const light_items[] = {"a", "bc", "def", "ghij", "klmno"};
  for (const char* item : light_items) {
    original.update(std::string(item), 1.0);
  }
  original.update("heavy item", 100.0);

  // byte-array round trip
  auto image = original.serialize();
  var_opt_sketch<std::string> from_bytes = var_opt_sketch<std::string>::deserialize(image.data(), image.size());
  check_if_equal(original, from_bytes);

  // stream round trip
  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
  original.serialize(stream);
  var_opt_sketch<std::string> from_stream = var_opt_sketch<std::string>::deserialize(stream);
  check_if_equal(original, from_stream);

  // ensure we unroll properly: an image cut short by 12 bytes must throw
  const size_t truncated_size = image.size() - 12;
  REQUIRE_THROWS_AS(var_opt_sketch<std::string>::deserialize(image.data(), truncated_size), std::out_of_range);
  const std::string truncated(reinterpret_cast<const char*>(image.data()), truncated_size);
  stream.str(truncated);
  REQUIRE_THROWS_AS(var_opt_sketch<std::string>::deserialize(stream), std::runtime_error);
}
|
|
330
|
+
|
|
331
|
+
// After k+2 unit-weight updates, the first reported weight should equal
// (k + 2) / k.
TEST_CASE("varopt sketch: pseudo-light update", "[var_opt_sketch]") {
  const uint32_t k = 1024;
  var_opt_sketch<int> sk = create_unweighted_sketch(k, k + 1);
  sk.update(0, 1.0); // update number k+2

  // Check the first weight, assuming all k items are unweighted
  // (and consequently in R).
  // Expected: (k + 2) / |R| = (k + 2) / k
  const double expected_wt = (k + 2.0) / k;
  auto first = sk.begin();
  const double observed_wt = (*first).second;
  REQUIRE(observed_wt == Approx(expected_wt).margin(EPS));
}
|
|
343
|
+
|
|
344
|
+
// Fills a sketch to k+1 unit-weight items, then adds k increasingly heavy
// items. Checks the weight of the first reported item and of the last
// reported item.
TEST_CASE("varopt sketch: pseudo-heavy update", "[var_opt_sketch]") {
  uint32_t k = 1024;
  double wt_scale = 10.0 * k;
  var_opt_sketch<int> sk = create_unweighted_sketch(k, k + 1);

  // Next k-1 updates should be update_pseudo_heavy_general()
  // Last one should call update_pseudo_heavy_r_eq_1(), since we'll have
  // added k-1 heavy items, leaving only 1 item left in R
  for (uint32_t i = 1; i <= k; ++i) {
    // Convert to a signed int BEFORE negating. Negating the unsigned counter
    // would wrap and then rely on a narrowing conversion to produce -i.
    sk.update(-static_cast<int>(i), k + (i * wt_scale));
  }

  auto it = sk.begin();

  // Expected: lightest "heavy" item (first one out): k + 2*wt_scale
  double wt = (*it).second;
  REQUIRE(wt == Approx(1.0 * (k + (2 * wt_scale))).margin(EPS));

  // we don't know which R item is left, but there should be only one, at the end
  // of the sample set.
  // Expected: (k+1) + (min "heavy" item) / |R| = ((k+1) + (k + wt_scale)) / 1 = wt_scale + 2k + 1
  while (it != sk.end()) {
    wt = (*it).second;
    ++it;
  }
  REQUIRE(wt == Approx(1.0 + wt_scale + (2 * k)).margin(EPS));
}
|
|
371
|
+
|
|
372
|
+
// reset() must bring n back to 0 while keeping k, both after the sketch has
// seen more than k items and after it has seen fewer than k items.
TEST_CASE("varopt sketch: reset", "[var_opt_sketch]") {
  const uint32_t k = 1024;
  const uint64_t n_exact = 20;        // fewer than k items
  const uint64_t n_sampling = 2 * k;  // more than k items
  var_opt_sketch<std::string> sketch(k);

  // reset from sampling mode
  for (uint64_t i = 0; i < n_sampling; ++i) {
    sketch.update(std::to_string(i), 100.0 + i);
  }
  REQUIRE(sketch.get_n() == n_sampling);
  REQUIRE(sketch.get_k() == k);

  sketch.reset();
  REQUIRE(sketch.get_n() == 0);
  REQUIRE(sketch.get_k() == k);

  // reset from exact mode
  for (uint64_t i = 0; i < n_exact; ++i) {
    sketch.update(std::to_string(i));
  }
  REQUIRE(sketch.get_n() == n_exact);
  REQUIRE(sketch.get_k() == k);

  sketch.reset();
  REQUIRE(sketch.get_n() == 0);
  REQUIRE(sketch.get_k() == k);
}
|
|
399
|
+
|
|
400
|
+
// Exercises estimate_subset_sum() on an empty sketch, in exact mode, in
// sampling mode with always-true / always-false / sign-based predicates,
// and finally on a bool-valued sketch kept in exact mode.
TEST_CASE("varopt sketch: estimate subset sum", "[var_opt_sketch]") {
  const uint32_t k = 10;
  var_opt_sketch<int> sk(k);

  const auto accept_all = [](int){ return true; };
  const auto reject_all = [](int){ return false; };
  const auto negatives_only = [](int x) { return x < 0; };

  // empty sketch -- all zeros
  subset_summary summary = sk.estimate_subset_sum(accept_all);
  REQUIRE(summary.estimate == 0.0);
  REQUIRE(summary.total_sketch_weight == 0.0);

  // add items, keeping in exact mode
  double total_weight = 0.0;
  for (uint32_t i = 1; i <= (k - 1); ++i) {
    const double w = 1.0 * i;
    sk.update(i, w);
    total_weight += w;
  }

  summary = sk.estimate_subset_sum(accept_all);
  REQUIRE(summary.estimate == total_weight);
  REQUIRE(summary.lower_bound == total_weight);
  REQUIRE(summary.upper_bound == total_weight);
  REQUIRE(summary.total_sketch_weight == total_weight);

  // add a few more items, pushing to sampling mode
  for (uint32_t i = k; i <= (k + 1); ++i) {
    const double w = 1.0 * i;
    sk.update(i, w);
    total_weight += w;
  }

  // predicate always true so estimate == upper bound
  summary = sk.estimate_subset_sum(accept_all);
  REQUIRE(summary.estimate == Approx(total_weight).margin(EPS));
  REQUIRE(summary.upper_bound == Approx(total_weight).margin(EPS));
  REQUIRE(summary.lower_bound < total_weight);
  REQUIRE(summary.total_sketch_weight == Approx(total_weight).margin(EPS));

  // predicate always false so estimate == lower bound == 0.0
  summary = sk.estimate_subset_sum(reject_all);
  REQUIRE(summary.estimate == 0.0);
  REQUIRE(summary.lower_bound == 0.0);
  REQUIRE(summary.upper_bound > 0.0);
  REQUIRE(summary.total_sketch_weight == Approx(total_weight).margin(EPS));

  // finally, a non-degenerate predicate:
  // insert negative items with the same weights as before, then filter for
  // negative items only
  for (uint32_t i = 1; i <= (k + 1); ++i) {
    const double w = 1.0 * i;
    sk.update(static_cast<int32_t>(-i), w);
    total_weight += w;
  }

  summary = sk.estimate_subset_sum(negatives_only);
  REQUIRE(summary.estimate >= summary.lower_bound);
  REQUIRE(summary.estimate <= summary.upper_bound);

  // allow pretty generous bounds when testing
  REQUIRE(summary.lower_bound < (total_weight / 1.4));
  REQUIRE(summary.upper_bound > (total_weight / 2.6));
  REQUIRE(summary.total_sketch_weight == Approx(total_weight).margin(EPS));

  // and another data type, keeping it in exact mode for simplicity
  var_opt_sketch<bool> bool_sk(k);
  total_weight = 0.0;
  for (uint32_t i = 1; i <= (k - 1); ++i) {
    const bool is_even = (i % 2) == 0;
    bool_sk.update(is_even, 1.0 * i);
    total_weight += i;
  }

  summary = bool_sk.estimate_subset_sum([](bool b){ return !b; });
  REQUIRE(summary.estimate == summary.lower_bound);
  REQUIRE(summary.estimate == summary.upper_bound);
  REQUIRE(summary.estimate < total_weight); // exact mode, so know it must be strictly less
}
|
|
471
|
+
|
|
472
|
+
// Loads the binary file varopt_sketch_string_exact.sk and checks k, n, the
// sample count and the total sketch weight against the expected values.
TEST_CASE("varopt sketch: deserialize exact from java", "[var_opt_sketch]") {
  std::ifstream input;
  // make open/read failures throw instead of failing silently
  input.exceptions(std::ios::failbit | std::ios::badbit);
  input.open(testBinaryInputPath + "varopt_sketch_string_exact.sk", std::ios::binary);

  var_opt_sketch<std::string> sketch = var_opt_sketch<std::string>::deserialize(input);
  REQUIRE_FALSE(sketch.is_empty());
  REQUIRE(sketch.get_k() == 1024);
  REQUIRE(sketch.get_n() == 200);
  REQUIRE(sketch.get_num_samples() == 200);

  subset_summary summary = sketch.estimate_subset_sum([](std::string){ return true; });

  // expected total weight: sum of 1000/i for i = 1..200
  double expected_total = 0.0;
  for (int i = 1; i <= 200; ++i) {
    expected_total += 1000.0 / i;
  }
  REQUIRE(summary.total_sketch_weight == Approx(expected_total).margin(EPS));
}
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
// Loads the binary file varopt_sketch_long_sampling.sk and checks k, n, the
// sample count and several subset-sum estimates against the expected values.
TEST_CASE("varopt sketch: deserialize sampling from java", "[var_opt_sketch]") {
  std::ifstream input;
  // make open/read failures throw instead of failing silently
  input.exceptions(std::ios::failbit | std::ios::badbit);
  input.open(testBinaryInputPath + "varopt_sketch_long_sampling.sk", std::ios::binary);

  var_opt_sketch<int64_t> sketch = var_opt_sketch<int64_t>::deserialize(input);
  REQUIRE_FALSE(sketch.is_empty());
  REQUIRE(sketch.get_k() == 1024);
  REQUIRE(sketch.get_n() == 2003);
  REQUIRE(sketch.get_num_samples() == sketch.get_k());

  // every item selected
  subset_summary summary = sketch.estimate_subset_sum([](int64_t){ return true; });
  REQUIRE(summary.estimate == Approx(332000.0).margin(EPS));
  REQUIRE(summary.total_sketch_weight == Approx(332000.0).margin(EPS));

  // negative items only
  summary = sketch.estimate_subset_sum([](int64_t x){ return x < 0; });
  REQUIRE(summary.estimate == 330000.0); // heavy item, weight is exact

  // non-negative items only
  summary = sketch.estimate_subset_sum([](int64_t x){ return x >= 0; });
  REQUIRE(summary.estimate == Approx(2000.0).margin(EPS));
}
|
|
508
|
+
|
|
509
|
+
}
|