datasketches 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
@@ -0,0 +1,48 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
18
|
+
add_library(sampling INTERFACE)
|
19
|
+
|
20
|
+
add_library(${PROJECT_NAME}::SAMPLING ALIAS sampling)
|
21
|
+
|
22
|
+
if (BUILD_TESTS)
|
23
|
+
add_subdirectory(test)
|
24
|
+
endif()
|
25
|
+
|
26
|
+
target_include_directories(sampling
|
27
|
+
INTERFACE
|
28
|
+
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
|
29
|
+
$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
|
30
|
+
)
|
31
|
+
|
32
|
+
target_link_libraries(sampling INTERFACE common)
|
33
|
+
target_compile_features(sampling INTERFACE cxx_std_11)
|
34
|
+
|
35
|
+
# Public headers installed for the sampling module. The var_opt_union headers
# live in include/ next to the sketch headers and are part of the same API,
# so they are listed here as well; otherwise an installed tree lacks them.
set(sampling_HEADERS "include/var_opt_sketch.hpp;include/var_opt_sketch_impl.hpp;include/var_opt_union.hpp;include/var_opt_union_impl.hpp")
|
36
|
+
|
37
|
+
install(TARGETS sampling
|
38
|
+
EXPORT ${PROJECT_NAME}
|
39
|
+
)
|
40
|
+
|
41
|
+
install(FILES ${sampling_HEADERS}
|
42
|
+
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
|
43
|
+
|
44
|
+
target_sources(sampling
|
45
|
+
INTERFACE
|
46
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/var_opt_sketch.hpp
|
47
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/var_opt_sketch_impl.hpp
|
48
|
+
)
|
@@ -0,0 +1,392 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#ifndef _VAR_OPT_SKETCH_HPP_
|
21
|
+
#define _VAR_OPT_SKETCH_HPP_
|
22
|
+
|
23
|
+
#include "serde.hpp"
|
24
|
+
#include "common_defs.hpp"
|
25
|
+
|
26
|
+
#include <iterator>
|
27
|
+
#include <vector>
|
28
|
+
|
29
|
+
|
30
|
+
/**
|
31
|
+
* This sketch samples data from a stream of items, designed for optimal (minimum) variance when
|
32
|
+
* querying the sketch to estimate subset sums of items matching a provided predicate. Variance
|
33
|
+
* optimal (varopt) sampling is related to reservoir sampling, with improved error bounds for
|
34
|
+
* subset sum estimation.
|
35
|
+
*
|
36
|
+
* author Kevin Lang
|
37
|
+
* author Jon Malkin
|
38
|
+
*/
|
39
|
+
namespace datasketches {
|
40
|
+
|
41
|
+
// Allocator type A rebound (via std::allocator_traits) to allocate uint8_t elements.
template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
|
42
|
+
// Byte vector whose allocator is A rebound to uint8_t (see AllocU8).
template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
|
43
|
+
|
44
|
+
/**
|
45
|
+
* A struct to hold the result of subset sum queries
|
46
|
+
*/
|
47
|
+
struct subset_summary {
|
48
|
+
double lower_bound; // lower bound on the subset sum
|
49
|
+
double estimate; // estimated subset sum
|
50
|
+
double upper_bound; // upper bound on the subset sum
|
51
|
+
double total_sketch_weight; // total sketch weight reported alongside the estimate
|
52
|
+
};
|
53
|
+
|
54
|
+
// Resize factor options; each enumerator's value is log2 of the multiplier in its name.
enum resize_factor { X1 = 0, X2 = 1, X4 = 2, X8 = 3 };
|
55
|
+
|
56
|
+
template <typename T, typename S, typename A> class var_opt_union; // forward declaration
|
57
|
+
|
58
|
+
template <typename T, typename S = serde<T>, typename A = std::allocator<T>>
|
59
|
+
class var_opt_sketch {
|
60
|
+
|
61
|
+
public:
|
62
|
+
static const resize_factor DEFAULT_RESIZE_FACTOR = X8;
|
63
|
+
static const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
|
64
|
+
|
65
|
+
explicit var_opt_sketch(uint32_t k, resize_factor rf = DEFAULT_RESIZE_FACTOR);
|
66
|
+
var_opt_sketch(const var_opt_sketch& other);
|
67
|
+
var_opt_sketch(var_opt_sketch&& other) noexcept;
|
68
|
+
|
69
|
+
~var_opt_sketch();
|
70
|
+
|
71
|
+
var_opt_sketch& operator=(const var_opt_sketch& other);
|
72
|
+
var_opt_sketch& operator=(var_opt_sketch&& other);
|
73
|
+
|
74
|
+
/**
|
75
|
+
* Updates this sketch with the given data item with the given weight.
|
76
|
+
* This method takes an lvalue.
|
77
|
+
* @param item an item from a stream of items
|
78
|
+
* @param weight the weight of the item
|
79
|
+
*/
|
80
|
+
void update(const T& item, double weight=1.0);
|
81
|
+
|
82
|
+
/**
|
83
|
+
* Updates this sketch with the given data item with the given weight.
|
84
|
+
* This method takes an rvalue.
|
85
|
+
* @param item an item from a stream of items
|
86
|
+
* @param weight the weight of the item
|
87
|
+
*/
|
88
|
+
void update(T&& item, double weight=1.0);
|
89
|
+
|
90
|
+
/**
|
91
|
+
* Returns the configured maximum sample size.
|
92
|
+
* @return configured maximum sample size
|
93
|
+
*/
|
94
|
+
inline uint32_t get_k() const;
|
95
|
+
|
96
|
+
/**
|
97
|
+
* Returns the length of the input stream.
|
98
|
+
* @return stream length
|
99
|
+
*/
|
100
|
+
inline uint64_t get_n() const;
|
101
|
+
|
102
|
+
/**
|
103
|
+
* Returns the number of samples currently in the sketch
|
104
|
+
* @return number of samples currently in the sketch
|
105
|
+
*/
|
106
|
+
inline uint32_t get_num_samples() const;
|
107
|
+
|
108
|
+
/**
|
109
|
+
* Computes an estimated subset sum from the entire stream for objects matching a given
|
110
|
+
* predicate. Provides a lower bound, estimate, and upper bound using a target of 2 standard
|
111
|
+
* deviations. This is technically a heuristic method and tries to err on the conservative side.
|
112
|
+
* @param predicate a predicate function
|
113
|
+
* @return a subset_summary item with estimate, upper and lower bounds,
|
114
|
+
* and total sketch weight
|
115
|
+
*/
|
116
|
+
template<typename P>
|
117
|
+
subset_summary estimate_subset_sum(P predicate) const;
|
118
|
+
|
119
|
+
/**
|
120
|
+
* Returns true if the sketch is empty.
|
121
|
+
* @return empty flag
|
122
|
+
*/
|
123
|
+
inline bool is_empty() const;
|
124
|
+
|
125
|
+
/**
|
126
|
+
* Resets the sketch to its default, empty state.
|
127
|
+
*/
|
128
|
+
void reset();
|
129
|
+
|
130
|
+
/**
|
131
|
+
* Computes size needed to serialize the current state of the sketch.
|
132
|
+
* This version is for fixed-size arithmetic types (integral and floating point).
|
133
|
+
* @return size in bytes needed to serialize this sketch
|
134
|
+
*/
|
135
|
+
template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
|
136
|
+
inline size_t get_serialized_size_bytes() const;
|
137
|
+
|
138
|
+
/**
|
139
|
+
* Computes size needed to serialize the current state of the sketch.
|
140
|
+
* This version is for all other types and can be expensive since every item needs to be looked at.
|
141
|
+
* @return size in bytes needed to serialize this sketch
|
142
|
+
*/
|
143
|
+
template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
|
144
|
+
inline size_t get_serialized_size_bytes() const;
|
145
|
+
|
146
|
+
// This is a convenience alias for users
|
147
|
+
// The type returned by the following serialize method
|
148
|
+
typedef vector_u8<A> vector_bytes;
|
149
|
+
|
150
|
+
/**
|
151
|
+
* This method serializes the sketch as a vector of bytes.
|
152
|
+
* An optional header can be reserved in front of the sketch.
|
153
|
+
* It is a blank space of a given size.
|
154
|
+
* This header is used in Datasketches PostgreSQL extension.
|
155
|
+
* @param header_size_bytes space to reserve in front of the sketch
|
156
|
+
*/
|
157
|
+
vector_bytes serialize(unsigned header_size_bytes = 0) const;
|
158
|
+
|
159
|
+
/**
|
160
|
+
* This method serializes the sketch into a given stream in a binary form
|
161
|
+
* @param os output stream
|
162
|
+
*/
|
163
|
+
void serialize(std::ostream& os) const;
|
164
|
+
|
165
|
+
/**
|
166
|
+
* This method deserializes a sketch from a given stream.
|
167
|
+
* @param is input stream
|
168
|
+
* @return an instance of a sketch
|
169
|
+
*/
|
170
|
+
static var_opt_sketch deserialize(std::istream& is);
|
171
|
+
|
172
|
+
/**
|
173
|
+
* This method deserializes a sketch from a given array of bytes.
|
174
|
+
* @param bytes pointer to the array of bytes
|
175
|
+
* @param size the size of the array
|
176
|
+
* @return an instance of a sketch
|
177
|
+
*/
|
178
|
+
static var_opt_sketch deserialize(const void* bytes, size_t size);
|
179
|
+
|
180
|
+
/**
|
181
|
+
* Prints a summary of the sketch.
|
182
|
+
* @return the summary as a string
|
183
|
+
*/
|
184
|
+
string<A> to_string() const;
|
185
|
+
|
186
|
+
/**
|
187
|
+
* Prints the raw sketch items to a string. Calls items_to_stream() internally.
|
188
|
+
* Only works for type T with a defined operator<<() and
|
189
|
+
* kept separate from to_string() to allow compilation even if
|
190
|
+
* T does not have such an operator defined.
|
191
|
+
* @return a string with the sketch items
|
192
|
+
*/
|
193
|
+
string<A> items_to_string() const;
|
194
|
+
|
195
|
+
class const_iterator;
|
196
|
+
const_iterator begin() const;
|
197
|
+
const_iterator end() const;
|
198
|
+
|
199
|
+
private:
|
200
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
|
201
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<bool> AllocBool;
|
202
|
+
|
203
|
+
static const uint32_t MIN_LG_ARR_ITEMS = 3;
|
204
|
+
|
205
|
+
static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
|
206
|
+
static const uint8_t PREAMBLE_LONGS_WARMUP = 3;
|
207
|
+
static const uint8_t PREAMBLE_LONGS_FULL = 4;
|
208
|
+
static const uint8_t SER_VER = 2;
|
209
|
+
static const uint8_t FAMILY_ID = 13;
|
210
|
+
static const uint8_t EMPTY_FLAG_MASK = 4;
|
211
|
+
static const uint8_t GADGET_FLAG_MASK = 128;
|
212
|
+
|
213
|
+
// Number of standard deviations to use for subset sum error bounds
|
214
|
+
constexpr static const double DEFAULT_KAPPA = 2.0;
|
215
|
+
|
216
|
+
// TODO: should probably rearrange a bit to minimize gaps once aligned
|
217
|
+
uint32_t k_; // max size of sketch, in items
|
218
|
+
|
219
|
+
uint32_t h_; // number of items in heap
|
220
|
+
uint32_t m_; // number of items in middle region
|
221
|
+
uint32_t r_; // number of items in reservoir-like region
|
222
|
+
|
223
|
+
uint64_t n_; // total number of items processed by sketch
|
224
|
+
double total_wt_r_; // total weight of items in reservoir-like area
|
225
|
+
|
226
|
+
resize_factor rf_; // resize factor
|
227
|
+
|
228
|
+
uint32_t curr_items_alloc_; // currently allocated array size
|
229
|
+
bool filled_data_; // true if we've explicitly set all entries in data_
|
230
|
+
|
231
|
+
T* data_; // stored sampled items
|
232
|
+
double* weights_; // weights for sampled items
|
233
|
+
|
234
|
+
// The next two fields are hidden from the user because they are part of the state of the
|
235
|
+
// unioning algorithm, NOT part of a varopt sketch, or even of a varopt "gadget" (our name for
|
236
|
+
// the potentially invalid sketch that is maintained by the unioning algorithm). It would make
|
237
|
+
// more sense logically for these fields to be declared in the unioning object (whose entire
|
238
|
+
// purpose is storing the state of the unioning algorithm) but for reasons of programming
|
239
|
+
// convenience we are currently declaring them here. However, that could change in the future.
|
240
|
+
|
241
|
+
// Following int is:
|
242
|
+
// 1. Zero (for a varopt sketch)
|
243
|
+
// 2. Count of marked items in H region, if part of a unioning algo's gadget
|
244
|
+
uint32_t num_marks_in_h_;
|
245
|
+
|
246
|
+
// The following array is absent in a varopt sketch, and notionally present in a gadget
|
247
|
+
// (although it really belongs in the unioning object). If the array were to be made explicit,
|
248
|
+
// some additional coding would need to be done to ensure that all of the necessary data motion
|
249
|
+
// occurs and is properly tracked.
|
250
|
+
bool* marks_;
|
251
|
+
|
252
|
+
// used during deserialization to avoid memory leaks upon errors
|
253
|
+
class items_deleter;
|
254
|
+
class weights_deleter;
|
255
|
+
class marks_deleter;
|
256
|
+
|
257
|
+
var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget);
|
258
|
+
var_opt_sketch(uint32_t k, uint32_t h, uint32_t m, uint32_t r, uint64_t n, double total_wt_r, resize_factor rf,
|
259
|
+
uint32_t curr_items_alloc, bool filled_data, std::unique_ptr<T, items_deleter> items,
|
260
|
+
std::unique_ptr<double, weights_deleter> weights, uint32_t num_marks_in_h,
|
261
|
+
std::unique_ptr<bool, marks_deleter> marks);
|
262
|
+
|
263
|
+
friend class var_opt_union<T,S,A>;
|
264
|
+
var_opt_sketch(const var_opt_sketch& other, bool as_sketch, uint64_t adjusted_n);
|
265
|
+
var_opt_sketch(T* data, double* weights, size_t len, uint32_t k, uint64_t n, uint32_t h_count, uint32_t r_count, double total_wt_r);
|
266
|
+
|
267
|
+
string<A> items_to_string(bool print_gap) const;
|
268
|
+
|
269
|
+
// internal-use-only update
|
270
|
+
template<typename O>
|
271
|
+
inline void update(O&& item, double weight, bool mark);
|
272
|
+
|
273
|
+
template<typename O>
|
274
|
+
inline void update_warmup_phase(O&& item, double weight, bool mark);
|
275
|
+
|
276
|
+
template<typename O>
|
277
|
+
inline void update_light(O&& item, double weight, bool mark);
|
278
|
+
|
279
|
+
template<typename O>
|
280
|
+
inline void update_heavy_r_eq1(O&& item, double weight, bool mark);
|
281
|
+
|
282
|
+
template<typename O>
|
283
|
+
inline void update_heavy_general(O&& item, double weight, bool mark);
|
284
|
+
|
285
|
+
inline double get_tau() const;
|
286
|
+
inline double peek_min() const;
|
287
|
+
inline bool is_marked(uint32_t idx) const;
|
288
|
+
|
289
|
+
inline uint32_t pick_random_slot_in_r() const;
|
290
|
+
inline uint32_t choose_delete_slot(double wt_cand, uint32_t num_cand) const;
|
291
|
+
inline uint32_t choose_weighted_delete_slot(double wt_cand, uint32_t num_cand) const;
|
292
|
+
|
293
|
+
template<typename O>
|
294
|
+
inline void push(O&& item, double wt, bool mark);
|
295
|
+
inline void transition_from_warmup();
|
296
|
+
inline void convert_to_heap();
|
297
|
+
inline void restore_towards_leaves(uint32_t slot_in);
|
298
|
+
inline void restore_towards_root(uint32_t slot_in);
|
299
|
+
inline void pop_min_to_m_region();
|
300
|
+
void grow_candidate_set(double wt_cands, uint32_t num_cands);
|
301
|
+
void decrease_k_by_1();
|
302
|
+
void strip_marks();
|
303
|
+
void force_set_k(uint32_t k); // used to resolve union gadget into sketch
|
304
|
+
void downsample_candidate_set(double wt_cands, uint32_t num_cands);
|
305
|
+
inline void swap_values(uint32_t src, uint32_t dst);
|
306
|
+
void grow_data_arrays();
|
307
|
+
void allocate_data_arrays(uint32_t tgt_size, bool use_marks);
|
308
|
+
|
309
|
+
// validation
|
310
|
+
static void check_preamble_longs(uint8_t preamble_longs, uint8_t flags);
|
311
|
+
static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
|
312
|
+
static uint32_t validate_and_get_target_size(uint32_t preamble_longs, uint32_t k, uint64_t n,
|
313
|
+
uint32_t h, uint32_t r, resize_factor rf);
|
314
|
+
|
315
|
+
// things to move to common and be shared among sketches
|
316
|
+
static uint32_t get_adjusted_size(uint32_t max_size, uint32_t resize_target);
|
317
|
+
static uint32_t starting_sub_multiple(uint32_t lg_target, uint32_t lg_rf, uint32_t lg_min);
|
318
|
+
static inline double pseudo_hypergeometric_ub_on_p(uint64_t n, uint32_t k, double sampling_rate);
|
319
|
+
static inline double pseudo_hypergeometric_lb_on_p(uint64_t n, uint32_t k, double sampling_rate);
|
320
|
+
static bool is_power_of_2(uint32_t v);
|
321
|
+
static uint32_t to_log_2(uint32_t v);
|
322
|
+
static inline uint32_t next_int(uint32_t max_value);
|
323
|
+
static inline double next_double_exclude_zero();
|
324
|
+
|
325
|
+
class iterator;
|
326
|
+
};
|
327
|
+
|
328
|
+
template<typename T, typename S, typename A>
|
329
|
+
class var_opt_sketch<T, S, A>::const_iterator : public std::iterator<std::input_iterator_tag, T> {
|
330
|
+
public:
|
331
|
+
const_iterator(const const_iterator& other);
|
332
|
+
const_iterator& operator++();
|
333
|
+
const_iterator& operator++(int);
|
334
|
+
bool operator==(const const_iterator& other) const;
|
335
|
+
bool operator!=(const const_iterator& other) const;
|
336
|
+
const std::pair<const T&, const double> operator*() const;
|
337
|
+
|
338
|
+
private:
|
339
|
+
friend class var_opt_sketch<T,S,A>;
|
340
|
+
friend class var_opt_union<T,S,A>;
|
341
|
+
|
342
|
+
// default iterator over full sketch
|
343
|
+
const_iterator(const var_opt_sketch<T,S,A>& sk, bool is_end);
|
344
|
+
|
345
|
+
// iterates over only one of the H or R region, optionally applying weight correction
|
346
|
+
// to R region (can correct for numerical precision issues)
|
347
|
+
const_iterator(const var_opt_sketch<T,S,A>& sk, bool is_end, bool use_r_region);
|
348
|
+
|
349
|
+
bool get_mark() const;
|
350
|
+
|
351
|
+
const var_opt_sketch<T,S,A>* sk_;
|
352
|
+
double cum_r_weight_; // used for weight correction
|
353
|
+
double r_item_wt_;
|
354
|
+
size_t idx_;
|
355
|
+
const size_t final_idx_;
|
356
|
+
bool weight_correction_;
|
357
|
+
};
|
358
|
+
|
359
|
+
// non-const iterator for internal use
|
360
|
+
template<typename T, typename S, typename A>
|
361
|
+
class var_opt_sketch<T, S, A>::iterator : public std::iterator<std::input_iterator_tag, T> {
|
362
|
+
public:
|
363
|
+
iterator(const iterator& other);
|
364
|
+
iterator& operator++();
|
365
|
+
iterator& operator++(int);
|
366
|
+
bool operator==(const iterator& other) const;
|
367
|
+
bool operator!=(const iterator& other) const;
|
368
|
+
std::pair<T&, double> operator*();
|
369
|
+
|
370
|
+
private:
|
371
|
+
friend class var_opt_sketch<T,S,A>;
|
372
|
+
friend class var_opt_union<T,S,A>;
|
373
|
+
|
374
|
+
// iterates over only one of the H or R region, applying weight correction
|
375
|
+
// if iterating over R region (can correct for numerical precision issues)
|
376
|
+
iterator(const var_opt_sketch<T,S,A>& sk, bool is_end, bool use_r_region);
|
377
|
+
|
378
|
+
bool get_mark() const;
|
379
|
+
|
380
|
+
const var_opt_sketch<T,S,A>* sk_;
|
381
|
+
double cum_r_weight_; // used for weight correction
|
382
|
+
double r_item_wt_;
|
383
|
+
size_t idx_;
|
384
|
+
const size_t final_idx_;
|
385
|
+
};
|
386
|
+
|
387
|
+
|
388
|
+
} // namespace datasketches
|
389
|
+
|
390
|
+
#include "var_opt_sketch_impl.hpp"
|
391
|
+
|
392
|
+
#endif // _VAR_OPT_SKETCH_HPP_
|
@@ -0,0 +1,1752 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#ifndef _VAR_OPT_SKETCH_IMPL_HPP_
|
21
|
+
#define _VAR_OPT_SKETCH_IMPL_HPP_
|
22
|
+
|
23
|
+
#include <memory>
|
24
|
+
#include <sstream>
|
25
|
+
#include <cmath>
|
26
|
+
#include <random>
|
27
|
+
#include <algorithm>
|
28
|
+
|
29
|
+
#include "var_opt_sketch.hpp"
|
30
|
+
#include "serde.hpp"
|
31
|
+
#include "bounds_binomial_proportions.hpp"
|
32
|
+
#include "count_zeros.hpp"
|
33
|
+
#include "memory_operations.hpp"
|
34
|
+
#include "ceiling_power_of_2.hpp"
|
35
|
+
|
36
|
+
namespace datasketches {
|
37
|
+
|
38
|
+
/**
|
39
|
+
* Implementation code for the VarOpt sketch.
|
40
|
+
*
|
41
|
+
* author Kevin Lang
|
42
|
+
* author Jon Malkin
|
43
|
+
*/
|
44
|
+
template<typename T, typename S, typename A>
|
45
|
+
var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf) :
|
46
|
+
var_opt_sketch<T,S,A>(k, rf, false) {}
|
47
|
+
|
48
|
+
template<typename T, typename S, typename A>
|
49
|
+
var_opt_sketch<T,S,A>::var_opt_sketch(const var_opt_sketch& other) :
|
50
|
+
k_(other.k_),
|
51
|
+
h_(other.h_),
|
52
|
+
m_(other.m_),
|
53
|
+
r_(other.r_),
|
54
|
+
n_(other.n_),
|
55
|
+
total_wt_r_(other.total_wt_r_),
|
56
|
+
rf_(other.rf_),
|
57
|
+
curr_items_alloc_(other.curr_items_alloc_),
|
58
|
+
filled_data_(other.filled_data_),
|
59
|
+
data_(nullptr),
|
60
|
+
weights_(nullptr),
|
61
|
+
num_marks_in_h_(other.num_marks_in_h_),
|
62
|
+
marks_(nullptr)
|
63
|
+
{
|
64
|
+
data_ = A().allocate(curr_items_alloc_);
|
65
|
+
// skip gap or anything unused at the end
|
66
|
+
for (size_t i = 0; i < h_; ++i)
|
67
|
+
new (&data_[i]) T(other.data_[i]);
|
68
|
+
for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i)
|
69
|
+
new (&data_[i]) T(other.data_[i]);
|
70
|
+
|
71
|
+
// we skipped the gap
|
72
|
+
filled_data_ = false;
|
73
|
+
|
74
|
+
weights_ = AllocDouble().allocate(curr_items_alloc_);
|
75
|
+
// doubles so can successfully copy regardless of the internal state
|
76
|
+
std::copy(&other.weights_[0], &other.weights_[curr_items_alloc_], weights_);
|
77
|
+
|
78
|
+
if (other.marks_ != nullptr) {
|
79
|
+
marks_ = AllocBool().allocate(curr_items_alloc_);
|
80
|
+
std::copy(&other.marks_[0], &other.marks_[curr_items_alloc_], marks_);
|
81
|
+
}
|
82
|
+
}
|
83
|
+
|
84
|
+
template<typename T, typename S, typename A>
|
85
|
+
var_opt_sketch<T,S,A>::var_opt_sketch(const var_opt_sketch& other, bool as_sketch, uint64_t adjusted_n) :
|
86
|
+
k_(other.k_),
|
87
|
+
h_(other.h_),
|
88
|
+
m_(other.m_),
|
89
|
+
r_(other.r_),
|
90
|
+
n_(adjusted_n),
|
91
|
+
total_wt_r_(other.total_wt_r_),
|
92
|
+
rf_(other.rf_),
|
93
|
+
curr_items_alloc_(other.curr_items_alloc_),
|
94
|
+
filled_data_(other.filled_data_),
|
95
|
+
data_(nullptr),
|
96
|
+
weights_(nullptr),
|
97
|
+
num_marks_in_h_(other.num_marks_in_h_),
|
98
|
+
marks_(nullptr)
|
99
|
+
{
|
100
|
+
data_ = A().allocate(curr_items_alloc_);
|
101
|
+
// skip gap or anything unused at the end
|
102
|
+
for (size_t i = 0; i < h_; ++i)
|
103
|
+
new (&data_[i]) T(other.data_[i]);
|
104
|
+
for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i)
|
105
|
+
new (&data_[i]) T(other.data_[i]);
|
106
|
+
|
107
|
+
// we skipped the gap
|
108
|
+
filled_data_ = false;
|
109
|
+
|
110
|
+
weights_ = AllocDouble().allocate(curr_items_alloc_);
|
111
|
+
// doubles so can successfully copy regardless of the internal state
|
112
|
+
std::copy(&other.weights_[0], &other.weights_[curr_items_alloc_], weights_);
|
113
|
+
|
114
|
+
if (!as_sketch && other.marks_ != nullptr) {
|
115
|
+
marks_ = AllocBool().allocate(curr_items_alloc_);
|
116
|
+
std::copy(&other.marks_[0], &other.marks_[curr_items_alloc_], marks_);
|
117
|
+
}
|
118
|
+
}
|
119
|
+
|
120
|
+
template<typename T, typename S, typename A>
|
121
|
+
var_opt_sketch<T,S,A>::var_opt_sketch(T* data, double* weights, size_t len,
|
122
|
+
uint32_t k, uint64_t n, uint32_t h_count, uint32_t r_count, double total_wt_r) :
|
123
|
+
k_(k),
|
124
|
+
h_(h_count),
|
125
|
+
m_(0),
|
126
|
+
r_(r_count),
|
127
|
+
n_(n),
|
128
|
+
total_wt_r_(total_wt_r),
|
129
|
+
rf_(DEFAULT_RESIZE_FACTOR),
|
130
|
+
curr_items_alloc_(len),
|
131
|
+
filled_data_(n > k),
|
132
|
+
data_(data),
|
133
|
+
weights_(weights),
|
134
|
+
num_marks_in_h_(0),
|
135
|
+
marks_(nullptr)
|
136
|
+
{}
|
137
|
+
|
138
|
+
template<typename T, typename S, typename A>
|
139
|
+
var_opt_sketch<T,S,A>::var_opt_sketch(var_opt_sketch&& other) noexcept :
|
140
|
+
k_(other.k_),
|
141
|
+
h_(other.h_),
|
142
|
+
m_(other.m_),
|
143
|
+
r_(other.r_),
|
144
|
+
n_(other.n_),
|
145
|
+
total_wt_r_(other.total_wt_r_),
|
146
|
+
rf_(other.rf_),
|
147
|
+
curr_items_alloc_(other.curr_items_alloc_),
|
148
|
+
filled_data_(other.filled_data_),
|
149
|
+
data_(other.data_),
|
150
|
+
weights_(other.weights_),
|
151
|
+
num_marks_in_h_(other.num_marks_in_h_),
|
152
|
+
marks_(other.marks_)
|
153
|
+
{
|
154
|
+
other.data_ = nullptr;
|
155
|
+
other.weights_ = nullptr;
|
156
|
+
other.marks_ = nullptr;
|
157
|
+
}
|
158
|
+
|
159
|
+
template<typename T, typename S, typename A>
|
160
|
+
var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget) :
|
161
|
+
k_(k), h_(0), m_(0), r_(0), n_(0), total_wt_r_(0.0), rf_(rf) {
|
162
|
+
if (k == 0 || k_ > MAX_K) {
|
163
|
+
throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
|
164
|
+
}
|
165
|
+
|
166
|
+
uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
|
167
|
+
uint32_t initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf_, MIN_LG_ARR_ITEMS);
|
168
|
+
curr_items_alloc_ = get_adjusted_size(k_, 1 << initial_lg_size);
|
169
|
+
if (curr_items_alloc_ == k_) { // if full size, need to leave 1 for the gap
|
170
|
+
++curr_items_alloc_;
|
171
|
+
}
|
172
|
+
|
173
|
+
allocate_data_arrays(curr_items_alloc_, is_gadget);
|
174
|
+
num_marks_in_h_ = 0;
|
175
|
+
}
|
176
|
+
|
177
|
+
template<typename T, typename S, typename A>
|
178
|
+
var_opt_sketch<T,S,A>::var_opt_sketch(uint32_t k, uint32_t h, uint32_t m, uint32_t r, uint64_t n, double total_wt_r, resize_factor rf,
|
179
|
+
uint32_t curr_items_alloc, bool filled_data, std::unique_ptr<T, items_deleter> items,
|
180
|
+
std::unique_ptr<double, weights_deleter> weights, uint32_t num_marks_in_h,
|
181
|
+
std::unique_ptr<bool, marks_deleter> marks) :
|
182
|
+
k_(k),
|
183
|
+
h_(h),
|
184
|
+
m_(m),
|
185
|
+
r_(r),
|
186
|
+
n_(n),
|
187
|
+
total_wt_r_(total_wt_r),
|
188
|
+
rf_(rf),
|
189
|
+
curr_items_alloc_(curr_items_alloc),
|
190
|
+
filled_data_(filled_data),
|
191
|
+
data_(items.release()),
|
192
|
+
weights_(weights.release()),
|
193
|
+
num_marks_in_h_(num_marks_in_h),
|
194
|
+
marks_(marks.release())
|
195
|
+
{}
|
196
|
+
|
197
|
+
|
198
|
+
template<typename T, typename S, typename A>
|
199
|
+
var_opt_sketch<T,S,A>::~var_opt_sketch() {
|
200
|
+
if (data_ != nullptr) {
|
201
|
+
if (filled_data_) {
|
202
|
+
// destroy everything
|
203
|
+
const size_t num_to_destroy = std::min(k_ + 1, curr_items_alloc_);
|
204
|
+
for (size_t i = 0; i < num_to_destroy; ++i) {
|
205
|
+
A().destroy(data_ + i);
|
206
|
+
}
|
207
|
+
} else {
|
208
|
+
// skip gap or anything unused at the end
|
209
|
+
for (size_t i = 0; i < h_; ++i) {
|
210
|
+
A().destroy(data_+ i);
|
211
|
+
}
|
212
|
+
|
213
|
+
for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i) {
|
214
|
+
A().destroy(data_ + i);
|
215
|
+
}
|
216
|
+
}
|
217
|
+
A().deallocate(data_, curr_items_alloc_);
|
218
|
+
}
|
219
|
+
|
220
|
+
if (weights_ != nullptr) {
|
221
|
+
AllocDouble().deallocate(weights_, curr_items_alloc_);
|
222
|
+
}
|
223
|
+
|
224
|
+
if (marks_ != nullptr) {
|
225
|
+
AllocBool().deallocate(marks_, curr_items_alloc_);
|
226
|
+
}
|
227
|
+
}
|
228
|
+
|
229
|
+
template<typename T, typename S, typename A>
|
230
|
+
var_opt_sketch<T,S,A>& var_opt_sketch<T,S,A>::operator=(const var_opt_sketch& other) {
|
231
|
+
var_opt_sketch<T,S,A> sk_copy(other);
|
232
|
+
std::swap(k_, sk_copy.k_);
|
233
|
+
std::swap(h_, sk_copy.h_);
|
234
|
+
std::swap(m_, sk_copy.m_);
|
235
|
+
std::swap(r_, sk_copy.r_);
|
236
|
+
std::swap(n_, sk_copy.n_);
|
237
|
+
std::swap(total_wt_r_, sk_copy.total_wt_r_);
|
238
|
+
std::swap(rf_, sk_copy.rf_);
|
239
|
+
std::swap(curr_items_alloc_, sk_copy.curr_items_alloc_);
|
240
|
+
std::swap(filled_data_, sk_copy.filled_data_);
|
241
|
+
std::swap(data_, sk_copy.data_);
|
242
|
+
std::swap(weights_, sk_copy.weights_);
|
243
|
+
std::swap(num_marks_in_h_, sk_copy.num_marks_in_h_);
|
244
|
+
std::swap(marks_, sk_copy.marks_);
|
245
|
+
return *this;
|
246
|
+
}
|
247
|
+
|
248
|
+
template<typename T, typename S, typename A>
|
249
|
+
var_opt_sketch<T,S,A>& var_opt_sketch<T,S,A>::operator=(var_opt_sketch&& other) {
|
250
|
+
std::swap(k_, other.k_);
|
251
|
+
std::swap(h_, other.h_);
|
252
|
+
std::swap(m_, other.m_);
|
253
|
+
std::swap(r_, other.r_);
|
254
|
+
std::swap(n_, other.n_);
|
255
|
+
std::swap(total_wt_r_, other.total_wt_r_);
|
256
|
+
std::swap(rf_, other.rf_);
|
257
|
+
std::swap(curr_items_alloc_, other.curr_items_alloc_);
|
258
|
+
std::swap(filled_data_, other.filled_data_);
|
259
|
+
std::swap(data_, other.data_);
|
260
|
+
std::swap(weights_, other.weights_);
|
261
|
+
std::swap(num_marks_in_h_, other.num_marks_in_h_);
|
262
|
+
std::swap(marks_, other.marks_);
|
263
|
+
return *this;
|
264
|
+
}
|
265
|
+
|
266
|
+
/*
|
267
|
+
* An empty sketch requires 8 bytes.
|
268
|
+
*
|
269
|
+
* <pre>
|
270
|
+
* Long || Start Byte Adr:
|
271
|
+
* Adr:
|
272
|
+
* || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
|
273
|
+
* 0 || Preamble_Longs | SerVer | FamID | Flags |---------Max Res. Size (K)---------|
|
274
|
+
* </pre>
|
275
|
+
*
|
276
|
+
* A non-empty sketch requires 24 bytes of preamble for an under-full sample; once there are
|
277
|
+
* at least k items the sketch uses 32 bytes of preamble.
|
278
|
+
*
|
279
|
+
* The count of items seen is limited to 48 bits (~256 trillion) even though there are adjacent
|
280
|
+
* unused preamble bits. The acceptance probability for an item is a double in the range [0,1),
|
281
|
+
* limiting us to 53 bits of randomness due to details of the IEEE floating point format. To
|
282
|
+
* ensure meaningful probabilities as the items seen count approaches capacity, we intentionally
|
283
|
+
* use slightly fewer bits.
|
284
|
+
*
|
285
|
+
* Following the header are weights for the heavy items, then marks in the event this is a gadget.
|
286
|
+
* The serialized items come last.
|
287
|
+
*
|
288
|
+
* <pre>
|
289
|
+
* Long || Start Byte Adr:
|
290
|
+
* Adr:
|
291
|
+
* || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
|
292
|
+
* 0 || Preamble_Longs | SerVer | FamID | Flags |---------Max Res. Size (K)---------|
|
293
|
+
*
|
294
|
+
* || 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
|
295
|
+
* 1 ||---------------------------Items Seen Count (N)--------------------------------|
|
296
|
+
*
|
297
|
+
* || 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
|
298
|
+
* 2 ||-------------Item Count in H---------------|-------Item Count in R-------------|
|
299
|
+
*
|
300
|
+
* || 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
|
301
|
+
* 3 ||-------------------------------Total Weight in R-------------------------------|
|
302
|
+
* </pre>
|
303
|
+
*/
|
304
|
+
|
305
|
+
// implementation for fixed-size arithmetic types (integral and floating point)
|
306
|
+
template<typename T, typename S, typename A>
|
307
|
+
template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
|
308
|
+
size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
|
309
|
+
if (is_empty()) { return PREAMBLE_LONGS_EMPTY << 3; }
|
310
|
+
size_t num_bytes = (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL) << 3;
|
311
|
+
num_bytes += h_ * sizeof(double); // weights
|
312
|
+
if (marks_ != nullptr) { // marks
|
313
|
+
num_bytes += (h_ / 8) + (h_ % 8 > 0);
|
314
|
+
}
|
315
|
+
num_bytes += (h_ + r_) * sizeof(TT); // the actual items
|
316
|
+
return num_bytes;
|
317
|
+
}
|
318
|
+
|
319
|
+
// implementation for all other types
|
320
|
+
template<typename T, typename S, typename A>
|
321
|
+
template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
|
322
|
+
size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
|
323
|
+
if (is_empty()) { return PREAMBLE_LONGS_EMPTY << 3; }
|
324
|
+
size_t num_bytes = (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL) << 3;
|
325
|
+
num_bytes += h_ * sizeof(double); // weights
|
326
|
+
if (marks_ != nullptr) { // marks
|
327
|
+
num_bytes += (h_ / 8) + (h_ % 8 > 0);
|
328
|
+
}
|
329
|
+
// must iterate over the items
|
330
|
+
for (auto& it: *this)
|
331
|
+
num_bytes += S().size_of_item(it.first);
|
332
|
+
return num_bytes;
|
333
|
+
}
|
334
|
+
|
335
|
+
template<typename T, typename S, typename A>
|
336
|
+
std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned header_size_bytes) const {
|
337
|
+
const size_t size = header_size_bytes + get_serialized_size_bytes();
|
338
|
+
std::vector<uint8_t, AllocU8<A>> bytes(size);
|
339
|
+
uint8_t* ptr = bytes.data() + header_size_bytes;
|
340
|
+
uint8_t* end_ptr = ptr + size;
|
341
|
+
|
342
|
+
bool empty = is_empty();
|
343
|
+
uint8_t preLongs = (empty ? PREAMBLE_LONGS_EMPTY
|
344
|
+
: (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL));
|
345
|
+
uint8_t first_byte = (preLongs & 0x3F) | ((static_cast<uint8_t>(rf_)) << 6);
|
346
|
+
uint8_t flags = (marks_ != nullptr ? GADGET_FLAG_MASK : 0);
|
347
|
+
|
348
|
+
if (empty) {
|
349
|
+
flags |= EMPTY_FLAG_MASK;
|
350
|
+
}
|
351
|
+
|
352
|
+
// first prelong
|
353
|
+
uint8_t ser_ver(SER_VER);
|
354
|
+
uint8_t family(FAMILY_ID);
|
355
|
+
ptr += copy_to_mem(&first_byte, ptr, sizeof(uint8_t));
|
356
|
+
ptr += copy_to_mem(&ser_ver, ptr, sizeof(uint8_t));
|
357
|
+
ptr += copy_to_mem(&family, ptr, sizeof(uint8_t));
|
358
|
+
ptr += copy_to_mem(&flags, ptr, sizeof(uint8_t));
|
359
|
+
ptr += copy_to_mem(&k_, ptr, sizeof(uint32_t));
|
360
|
+
|
361
|
+
if (!empty) {
|
362
|
+
// second and third prelongs
|
363
|
+
ptr += copy_to_mem(&n_, ptr, sizeof(uint64_t));
|
364
|
+
ptr += copy_to_mem(&h_, ptr, sizeof(uint32_t));
|
365
|
+
ptr += copy_to_mem(&r_, ptr, sizeof(uint32_t));
|
366
|
+
|
367
|
+
// fourth prelong, if needed
|
368
|
+
if (r_ > 0) {
|
369
|
+
ptr += copy_to_mem(&total_wt_r_, ptr, sizeof(double));
|
370
|
+
}
|
371
|
+
|
372
|
+
// first h_ weights
|
373
|
+
ptr += copy_to_mem(weights_, ptr, h_ * sizeof(double));
|
374
|
+
|
375
|
+
// first h_ marks as packed bytes iff we have a gadget
|
376
|
+
if (marks_ != nullptr) {
|
377
|
+
uint8_t val = 0;
|
378
|
+
for (uint32_t i = 0; i < h_; ++i) {
|
379
|
+
if (marks_[i]) {
|
380
|
+
val |= 0x1 << (i & 0x7);
|
381
|
+
}
|
382
|
+
|
383
|
+
if ((i & 0x7) == 0x7) {
|
384
|
+
ptr += copy_to_mem(&val, ptr, sizeof(uint8_t));
|
385
|
+
val = 0;
|
386
|
+
}
|
387
|
+
}
|
388
|
+
|
389
|
+
// write out any remaining values
|
390
|
+
if ((h_ & 0x7) > 0) {
|
391
|
+
ptr += copy_to_mem(&val, ptr, sizeof(uint8_t));
|
392
|
+
}
|
393
|
+
}
|
394
|
+
|
395
|
+
// write the sample items, skipping the gap. Either h_ or r_ may be 0
|
396
|
+
ptr += S().serialize(ptr, end_ptr - ptr, data_, h_);
|
397
|
+
ptr += S().serialize(ptr, end_ptr - ptr, &data_[h_ + 1], r_);
|
398
|
+
}
|
399
|
+
|
400
|
+
size_t bytes_written = ptr - bytes.data();
|
401
|
+
if (bytes_written != size) {
|
402
|
+
throw std::logic_error("serialized size mismatch: " + std::to_string(bytes_written) + " != " + std::to_string(size));
|
403
|
+
}
|
404
|
+
|
405
|
+
return bytes;
|
406
|
+
}
|
407
|
+
|
408
|
+
template<typename T, typename S, typename A>
|
409
|
+
void var_opt_sketch<T,S,A>::serialize(std::ostream& os) const {
|
410
|
+
const bool empty = (h_ == 0) && (r_ == 0);
|
411
|
+
|
412
|
+
const uint8_t preLongs = (empty ? PREAMBLE_LONGS_EMPTY
|
413
|
+
: (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL));
|
414
|
+
const uint8_t first_byte = (preLongs & 0x3F) | ((static_cast<uint8_t>(rf_)) << 6);
|
415
|
+
uint8_t flags = (marks_ != nullptr ? GADGET_FLAG_MASK : 0);
|
416
|
+
|
417
|
+
if (empty) {
|
418
|
+
flags |= EMPTY_FLAG_MASK;
|
419
|
+
}
|
420
|
+
|
421
|
+
// first prelong
|
422
|
+
const uint8_t ser_ver(SER_VER);
|
423
|
+
const uint8_t family(FAMILY_ID);
|
424
|
+
os.write((char*)&first_byte, sizeof(uint8_t));
|
425
|
+
os.write((char*)&ser_ver, sizeof(uint8_t));
|
426
|
+
os.write((char*)&family, sizeof(uint8_t));
|
427
|
+
os.write((char*)&flags, sizeof(uint8_t));
|
428
|
+
os.write((char*)&k_, sizeof(uint32_t));
|
429
|
+
|
430
|
+
if (!empty) {
|
431
|
+
// second and third prelongs
|
432
|
+
os.write((char*)&n_, sizeof(uint64_t));
|
433
|
+
os.write((char*)&h_, sizeof(uint32_t));
|
434
|
+
os.write((char*)&r_, sizeof(uint32_t));
|
435
|
+
|
436
|
+
// fourth prelong, if needed
|
437
|
+
if (r_ > 0) {
|
438
|
+
os.write((char*)&total_wt_r_, sizeof(double));
|
439
|
+
}
|
440
|
+
|
441
|
+
// write the first h_ weights
|
442
|
+
os.write((char*)weights_, h_ * sizeof(double));
|
443
|
+
|
444
|
+
// write the first h_ marks as packed bytes iff we have a gadget
|
445
|
+
if (marks_ != nullptr) {
|
446
|
+
uint8_t val = 0;
|
447
|
+
for (uint32_t i = 0; i < h_; ++i) {
|
448
|
+
if (marks_[i]) {
|
449
|
+
val |= 0x1 << (i & 0x7);
|
450
|
+
}
|
451
|
+
|
452
|
+
if ((i & 0x7) == 0x7) {
|
453
|
+
os.write((char*)&val, sizeof(uint8_t));
|
454
|
+
val = 0;
|
455
|
+
}
|
456
|
+
}
|
457
|
+
|
458
|
+
// write out any remaining values
|
459
|
+
if ((h_ & 0x7) > 0) {
|
460
|
+
os.write((char*)&val, sizeof(uint8_t));
|
461
|
+
}
|
462
|
+
}
|
463
|
+
|
464
|
+
// write the sample items, skipping the gap. Either h_ or r_ may be 0
|
465
|
+
S().serialize(os, data_, h_);
|
466
|
+
S().serialize(os, &data_[h_ + 1], r_);
|
467
|
+
}
|
468
|
+
}
|
469
|
+
|
470
|
+
template<typename T, typename S, typename A>
|
471
|
+
var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size_t size) {
|
472
|
+
ensure_minimum_memory(size, 8);
|
473
|
+
const char* ptr = static_cast<const char*>(bytes);
|
474
|
+
const char* base = ptr;
|
475
|
+
const char* end_ptr = ptr + size;
|
476
|
+
uint8_t first_byte;
|
477
|
+
ptr += copy_from_mem(ptr, &first_byte, sizeof(first_byte));
|
478
|
+
uint8_t preamble_longs = first_byte & 0x3f;
|
479
|
+
resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
|
480
|
+
uint8_t serial_version;
|
481
|
+
ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
|
482
|
+
uint8_t family_id;
|
483
|
+
ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
|
484
|
+
uint8_t flags;
|
485
|
+
ptr += copy_from_mem(ptr, &flags, sizeof(flags));
|
486
|
+
uint32_t k;
|
487
|
+
ptr += copy_from_mem(ptr, &k, sizeof(k));
|
488
|
+
|
489
|
+
check_preamble_longs(preamble_longs, flags);
|
490
|
+
check_family_and_serialization_version(family_id, serial_version);
|
491
|
+
ensure_minimum_memory(size, preamble_longs << 3);
|
492
|
+
|
493
|
+
const bool is_empty = flags & EMPTY_FLAG_MASK;
|
494
|
+
const bool is_gadget = flags & GADGET_FLAG_MASK;
|
495
|
+
|
496
|
+
if (is_empty) {
|
497
|
+
return var_opt_sketch<T,S,A>(k, rf, is_gadget);
|
498
|
+
}
|
499
|
+
|
500
|
+
// second and third prelongs
|
501
|
+
uint64_t n;
|
502
|
+
uint32_t h, r;
|
503
|
+
ptr += copy_from_mem(ptr, &n, sizeof(n));
|
504
|
+
ptr += copy_from_mem(ptr, &h, sizeof(h));
|
505
|
+
ptr += copy_from_mem(ptr, &r, sizeof(r));
|
506
|
+
|
507
|
+
const uint32_t array_size = validate_and_get_target_size(preamble_longs, k, n, h, r, rf);
|
508
|
+
|
509
|
+
// current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating
|
510
|
+
double total_wt_r = 0.0;
|
511
|
+
if (preamble_longs == PREAMBLE_LONGS_FULL) {
|
512
|
+
ptr += copy_from_mem(ptr, &total_wt_r, sizeof(total_wt_r));
|
513
|
+
if (std::isnan(total_wt_r) || r == 0 || total_wt_r <= 0.0) {
|
514
|
+
throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
|
515
|
+
"Found r = " + std::to_string(r) + ", R region weight = " + std::to_string(total_wt_r));
|
516
|
+
}
|
517
|
+
} else {
|
518
|
+
total_wt_r = 0.0;
|
519
|
+
}
|
520
|
+
|
521
|
+
// read the first h_ weights, fill in rest of array with -1.0
|
522
|
+
check_memory_size(ptr - base + (h * sizeof(double)), size);
|
523
|
+
std::unique_ptr<double, weights_deleter> weights(AllocDouble().allocate(array_size), weights_deleter(array_size));
|
524
|
+
double* wts = weights.get(); // to avoid lots of .get() calls -- do not delete
|
525
|
+
ptr += copy_from_mem(ptr, wts, h * sizeof(double));
|
526
|
+
for (size_t i = 0; i < h; ++i) {
|
527
|
+
if (!(wts[i] > 0.0)) {
|
528
|
+
throw std::invalid_argument("Possible corruption: Non-positive weight when deserializing: " + std::to_string(wts[i]));
|
529
|
+
}
|
530
|
+
}
|
531
|
+
std::fill(&wts[h], &wts[array_size], -1.0);
|
532
|
+
|
533
|
+
// read the first h_ marks as packed bytes iff we have a gadget
|
534
|
+
uint32_t num_marks_in_h = 0;
|
535
|
+
std::unique_ptr<bool, marks_deleter> marks(nullptr, marks_deleter(array_size));
|
536
|
+
if (is_gadget) {
|
537
|
+
uint8_t val = 0;
|
538
|
+
marks = std::unique_ptr<bool, marks_deleter>(AllocBool().allocate(array_size), marks_deleter(array_size));
|
539
|
+
const size_t size_marks = (h / 8) + (h % 8 > 0 ? 1 : 0);
|
540
|
+
check_memory_size(ptr - base + size_marks, size);
|
541
|
+
for (uint32_t i = 0; i < h; ++i) {
|
542
|
+
if ((i & 0x7) == 0x0) { // should trigger on first iteration
|
543
|
+
ptr += copy_from_mem(ptr, &val, sizeof(val));
|
544
|
+
}
|
545
|
+
marks.get()[i] = ((val >> (i & 0x7)) & 0x1) == 1;
|
546
|
+
num_marks_in_h += (marks.get()[i] ? 1 : 0);
|
547
|
+
}
|
548
|
+
}
|
549
|
+
|
550
|
+
// read the sample items, skipping the gap. Either h_ or r_ may be 0
|
551
|
+
items_deleter deleter(array_size);
|
552
|
+
std::unique_ptr<T, items_deleter> items(A().allocate(array_size), deleter);
|
553
|
+
|
554
|
+
ptr += S().deserialize(ptr, end_ptr - ptr, items.get(), h);
|
555
|
+
items.get_deleter().set_h(h); // serde didn't throw, so the items are now valid
|
556
|
+
|
557
|
+
ptr += S().deserialize(ptr, end_ptr - ptr, &(items.get()[h + 1]), r);
|
558
|
+
items.get_deleter().set_r(r); // serde didn't throw, so the items are now valid
|
559
|
+
|
560
|
+
return var_opt_sketch(k, h, (r > 0 ? 1 : 0), r, n, total_wt_r, rf, array_size, false,
|
561
|
+
std::move(items), std::move(weights), num_marks_in_h, std::move(marks));
|
562
|
+
}
|
563
|
+
|
564
|
+
template<typename T, typename S, typename A>
|
565
|
+
var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is) {
|
566
|
+
uint8_t first_byte;
|
567
|
+
is.read((char*)&first_byte, sizeof(first_byte));
|
568
|
+
uint8_t preamble_longs = first_byte & 0x3f;
|
569
|
+
resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
|
570
|
+
uint8_t serial_version;
|
571
|
+
is.read((char*)&serial_version, sizeof(serial_version));
|
572
|
+
uint8_t family_id;
|
573
|
+
is.read((char*)&family_id, sizeof(family_id));
|
574
|
+
uint8_t flags;
|
575
|
+
is.read((char*)&flags, sizeof(flags));
|
576
|
+
uint32_t k;
|
577
|
+
is.read((char*)&k, sizeof(k));
|
578
|
+
|
579
|
+
check_preamble_longs(preamble_longs, flags);
|
580
|
+
check_family_and_serialization_version(family_id, serial_version);
|
581
|
+
|
582
|
+
const bool is_empty = flags & EMPTY_FLAG_MASK;
|
583
|
+
const bool is_gadget = flags & GADGET_FLAG_MASK;
|
584
|
+
|
585
|
+
if (is_empty) {
|
586
|
+
if (!is.good())
|
587
|
+
throw std::runtime_error("error reading from std::istream");
|
588
|
+
else
|
589
|
+
return var_opt_sketch<T,S,A>(k, rf, is_gadget);
|
590
|
+
}
|
591
|
+
|
592
|
+
// second and third prelongs
|
593
|
+
uint64_t n;
|
594
|
+
uint32_t h, r;
|
595
|
+
is.read((char*)&n, sizeof(n));
|
596
|
+
is.read((char*)&h, sizeof(h));
|
597
|
+
is.read((char*)&r, sizeof(r));
|
598
|
+
|
599
|
+
const uint32_t array_size = validate_and_get_target_size(preamble_longs, k, n, h, r, rf);
|
600
|
+
|
601
|
+
// current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating
|
602
|
+
double total_wt_r = 0.0;
|
603
|
+
if (preamble_longs == PREAMBLE_LONGS_FULL) {
|
604
|
+
is.read((char*)&total_wt_r, sizeof(total_wt_r));
|
605
|
+
if (std::isnan(total_wt_r) || r == 0 || total_wt_r <= 0.0) {
|
606
|
+
throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
|
607
|
+
"Found r = " + std::to_string(r) + ", R region weight = " + std::to_string(total_wt_r));
|
608
|
+
}
|
609
|
+
} else {
|
610
|
+
total_wt_r = 0.0;
|
611
|
+
}
|
612
|
+
|
613
|
+
// read the first h weights, fill remainder with -1.0
|
614
|
+
std::unique_ptr<double, weights_deleter> weights(AllocDouble().allocate(array_size), weights_deleter(array_size));
|
615
|
+
double* wts = weights.get(); // to avoid lots of .get() calls -- do not delete
|
616
|
+
is.read((char*)wts, h * sizeof(double));
|
617
|
+
for (size_t i = 0; i < h; ++i) {
|
618
|
+
if (!(wts[i] > 0.0)) {
|
619
|
+
throw std::invalid_argument("Possible corruption: Non-positive weight when deserializing: " + std::to_string(wts[i]));
|
620
|
+
}
|
621
|
+
}
|
622
|
+
std::fill(&wts[h], &wts[array_size], -1.0);
|
623
|
+
|
624
|
+
// read the first h_ marks as packed bytes iff we have a gadget
|
625
|
+
uint32_t num_marks_in_h = 0;
|
626
|
+
std::unique_ptr<bool, marks_deleter> marks(nullptr, marks_deleter(array_size));
|
627
|
+
if (is_gadget) {
|
628
|
+
marks = std::unique_ptr<bool, marks_deleter>(AllocBool().allocate(array_size), marks_deleter(array_size));
|
629
|
+
uint8_t val = 0;
|
630
|
+
for (uint32_t i = 0; i < h; ++i) {
|
631
|
+
if ((i & 0x7) == 0x0) { // should trigger on first iteration
|
632
|
+
is.read((char*)&val, sizeof(val));
|
633
|
+
}
|
634
|
+
marks.get()[i] = ((val >> (i & 0x7)) & 0x1) == 1;
|
635
|
+
num_marks_in_h += (marks.get()[i] ? 1 : 0);
|
636
|
+
}
|
637
|
+
}
|
638
|
+
|
639
|
+
// read the sample items, skipping the gap. Either h or r may be 0
|
640
|
+
items_deleter deleter(array_size);
|
641
|
+
std::unique_ptr<T, items_deleter> items(A().allocate(array_size), deleter);
|
642
|
+
|
643
|
+
S().deserialize(is, items.get(), h); // aka &data_[0]
|
644
|
+
items.get_deleter().set_h(h); // serde didn't throw, so the items are now valid
|
645
|
+
|
646
|
+
S().deserialize(is, &(items.get()[h + 1]), r);
|
647
|
+
items.get_deleter().set_r(r); // serde didn't throw, so the items are now valid
|
648
|
+
|
649
|
+
if (!is.good())
|
650
|
+
throw std::runtime_error("error reading from std::istream");
|
651
|
+
|
652
|
+
return var_opt_sketch(k, h, (r > 0 ? 1 : 0), r, n, total_wt_r, rf, array_size, false,
|
653
|
+
std::move(items), std::move(weights), num_marks_in_h, std::move(marks));
|
654
|
+
}
|
655
|
+
|
656
|
+
template<typename T, typename S, typename A>
|
657
|
+
bool var_opt_sketch<T,S,A>::is_empty() const {
|
658
|
+
return (h_ == 0 && r_ == 0);
|
659
|
+
}
|
660
|
+
|
661
|
+
template<typename T, typename S, typename A>
|
662
|
+
void var_opt_sketch<T,S,A>::reset() {
|
663
|
+
const uint32_t prev_alloc = curr_items_alloc_;
|
664
|
+
const uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
|
665
|
+
const uint32_t initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf_, MIN_LG_ARR_ITEMS);
|
666
|
+
curr_items_alloc_ = get_adjusted_size(k_, 1 << initial_lg_size);
|
667
|
+
if (curr_items_alloc_ == k_) { // if full size, need to leave 1 for the gap
|
668
|
+
++curr_items_alloc_;
|
669
|
+
}
|
670
|
+
|
671
|
+
if (filled_data_) {
|
672
|
+
// destroy everything
|
673
|
+
const size_t num_to_destroy = std::min(k_ + 1, prev_alloc);
|
674
|
+
for (size_t i = 0; i < num_to_destroy; ++i)
|
675
|
+
A().destroy(data_ + i);
|
676
|
+
} else {
|
677
|
+
// skip gap or anything unused at the end
|
678
|
+
for (size_t i = 0; i < h_; ++i)
|
679
|
+
A().destroy(data_+ i);
|
680
|
+
|
681
|
+
for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i)
|
682
|
+
A().destroy(data_ + i);
|
683
|
+
}
|
684
|
+
|
685
|
+
if (curr_items_alloc_ < prev_alloc) {
|
686
|
+
const bool is_gadget = (marks_ != nullptr);
|
687
|
+
|
688
|
+
A().deallocate(data_, prev_alloc);
|
689
|
+
AllocDouble().deallocate(weights_, prev_alloc);
|
690
|
+
|
691
|
+
if (marks_ != nullptr)
|
692
|
+
AllocBool().deallocate(marks_, prev_alloc);
|
693
|
+
|
694
|
+
allocate_data_arrays(curr_items_alloc_, is_gadget);
|
695
|
+
}
|
696
|
+
|
697
|
+
n_ = 0;
|
698
|
+
h_ = 0;
|
699
|
+
m_ = 0;
|
700
|
+
r_ = 0;
|
701
|
+
num_marks_in_h_ = 0;
|
702
|
+
total_wt_r_ = 0.0;
|
703
|
+
filled_data_ = false;
|
704
|
+
}
|
705
|
+
|
706
|
+
template<typename T, typename S, typename A>
|
707
|
+
uint64_t var_opt_sketch<T,S,A>::get_n() const {
|
708
|
+
return n_;
|
709
|
+
}
|
710
|
+
|
711
|
+
template<typename T, typename S, typename A>
|
712
|
+
uint32_t var_opt_sketch<T,S,A>::get_k() const {
|
713
|
+
return k_;
|
714
|
+
}
|
715
|
+
|
716
|
+
template<typename T, typename S, typename A>
|
717
|
+
uint32_t var_opt_sketch<T,S,A>::get_num_samples() const {
|
718
|
+
const uint32_t num_in_sketch = h_ + r_;
|
719
|
+
return (num_in_sketch < k_ ? num_in_sketch : k_);
|
720
|
+
}
|
721
|
+
|
722
|
+
template<typename T, typename S, typename A>
|
723
|
+
void var_opt_sketch<T,S,A>::update(const T& item, double weight) {
|
724
|
+
update(item, weight, false);
|
725
|
+
}
|
726
|
+
|
727
|
+
template<typename T, typename S, typename A>
|
728
|
+
void var_opt_sketch<T,S,A>::update(T&& item, double weight) {
|
729
|
+
update(std::move(item), weight, false);
|
730
|
+
}
|
731
|
+
|
732
|
+
template<typename T, typename S, typename A>
|
733
|
+
string<A> var_opt_sketch<T,S,A>::to_string() const {
|
734
|
+
std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
|
735
|
+
os << "### VarOpt SUMMARY: " << std::endl;
|
736
|
+
os << " k : " << k_ << std::endl;
|
737
|
+
os << " h : " << h_ << std::endl;
|
738
|
+
os << " r : " << r_ << std::endl;
|
739
|
+
os << " weight_r : " << total_wt_r_ << std::endl;
|
740
|
+
os << " Current size : " << curr_items_alloc_ << std::endl;
|
741
|
+
os << " Resize factor: " << (1 << rf_) << std::endl;
|
742
|
+
os << "### END SKETCH SUMMARY" << std::endl;
|
743
|
+
return os.str();
|
744
|
+
}
|
745
|
+
|
746
|
+
template<typename T, typename S, typename A>
|
747
|
+
string<A> var_opt_sketch<T,S,A>::items_to_string() const {
|
748
|
+
std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
|
749
|
+
os << "### Sketch Items" << std::endl;
|
750
|
+
int idx = 0;
|
751
|
+
for (auto record : *this) {
|
752
|
+
os << idx << ": " << record.first << "\twt = " << record.second << std::endl;
|
753
|
+
++idx;
|
754
|
+
}
|
755
|
+
return os.str();
|
756
|
+
}
|
757
|
+
|
758
|
+
template<typename T, typename S, typename A>
|
759
|
+
string<A> var_opt_sketch<T,S,A>::items_to_string(bool print_gap) const {
|
760
|
+
std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
|
761
|
+
os << "### Sketch Items" << std::endl;
|
762
|
+
const uint32_t array_length = (n_ < k_ ? n_ : k_ + 1);
|
763
|
+
for (uint32_t i = 0, display_idx = 0; i < array_length; ++i) {
|
764
|
+
if (i == h_ && print_gap) {
|
765
|
+
os << i << ": GAP" << std::endl;
|
766
|
+
++display_idx;
|
767
|
+
} else {
|
768
|
+
os << i << ": " << data_[i] << "\twt = ";
|
769
|
+
if (weights_[i] == -1.0) {
|
770
|
+
os << get_tau() << "\t(-1.0)" << std::endl;
|
771
|
+
} else {
|
772
|
+
os << weights_[i] << std::endl;
|
773
|
+
}
|
774
|
+
++display_idx;
|
775
|
+
}
|
776
|
+
}
|
777
|
+
return os.str();
|
778
|
+
}
|
779
|
+
|
780
|
+
template<typename T, typename S, typename A>
|
781
|
+
template<typename O>
|
782
|
+
void var_opt_sketch<T,S,A>::update(O&& item, double weight, bool mark) {
|
783
|
+
if (weight < 0.0 || std::isnan(weight) || std::isinf(weight)) {
|
784
|
+
throw std::invalid_argument("Item weights must be nonnegative and finite. Found: "
|
785
|
+
+ std::to_string(weight));
|
786
|
+
} else if (weight == 0.0) {
|
787
|
+
return;
|
788
|
+
}
|
789
|
+
++n_;
|
790
|
+
|
791
|
+
if (r_ == 0) {
|
792
|
+
// exact mode
|
793
|
+
update_warmup_phase(std::forward<O>(item), weight, mark);
|
794
|
+
} else {
|
795
|
+
// sketch is in estimation mode so we can make the following check,
|
796
|
+
// although very conservative to check every time
|
797
|
+
if ((h_ != 0) && (peek_min() < get_tau()))
|
798
|
+
throw std::logic_error("sketch not in valid estimation mode");
|
799
|
+
|
800
|
+
// what tau would be if deletion candidates turn out to be R plus the new item
|
801
|
+
// note: (r_ + 1) - 1 is intentional
|
802
|
+
const double hypothetical_tau = (weight + total_wt_r_) / ((r_ + 1) - 1);
|
803
|
+
|
804
|
+
// is new item's turn to be considered for reservoir?
|
805
|
+
const double condition1 = (h_ == 0) || (weight <= peek_min());
|
806
|
+
|
807
|
+
// is new item light enough for reservoir?
|
808
|
+
const double condition2 = weight < hypothetical_tau;
|
809
|
+
|
810
|
+
if (condition1 && condition2) {
|
811
|
+
update_light(std::forward<O>(item), weight, mark);
|
812
|
+
} else if (r_ == 1) {
|
813
|
+
update_heavy_r_eq1(std::forward<O>(item), weight, mark);
|
814
|
+
} else {
|
815
|
+
update_heavy_general(std::forward<O>(item), weight, mark);
|
816
|
+
}
|
817
|
+
}
|
818
|
+
}
|
819
|
+
|
820
|
+
template<typename T, typename S, typename A>
|
821
|
+
template<typename O>
|
822
|
+
void var_opt_sketch<T,S,A>::update_warmup_phase(O&& item, double weight, bool mark) {
|
823
|
+
// seems overly cautious
|
824
|
+
if (r_ > 0 || m_ != 0 || h_ > k_) throw std::logic_error("invalid sketch state during warmup");
|
825
|
+
|
826
|
+
if (h_ >= curr_items_alloc_) {
|
827
|
+
grow_data_arrays();
|
828
|
+
}
|
829
|
+
|
830
|
+
// store items as they come in until full
|
831
|
+
new (&data_[h_]) T(std::forward<O>(item));
|
832
|
+
weights_[h_] = weight;
|
833
|
+
if (marks_ != nullptr) {
|
834
|
+
marks_[h_] = mark;
|
835
|
+
}
|
836
|
+
++h_;
|
837
|
+
num_marks_in_h_ += mark ? 1 : 0;
|
838
|
+
|
839
|
+
// check if need to heapify
|
840
|
+
if (h_ > k_) {
|
841
|
+
filled_data_ = true;
|
842
|
+
transition_from_warmup();
|
843
|
+
}
|
844
|
+
}
|
845
|
+
|
846
|
+
/* In the "light" case the new item has weight <= old_tau, so
|
847
|
+
would appear to the right of the R items in a hypothetical reverse-sorted
|
848
|
+
list. It is easy to prove that it is light enough to be part of this
|
849
|
+
round's downsampling */
|
850
|
+
template<typename T, typename S, typename A>
|
851
|
+
template<typename O>
|
852
|
+
void var_opt_sketch<T,S,A>::update_light(O&& item, double weight, bool mark) {
|
853
|
+
if (r_ == 0 || (r_ + h_) != k_) throw std::logic_error("invalid sketch state during light warmup");
|
854
|
+
|
855
|
+
const uint32_t m_slot = h_; // index of the gap, which becomes the M region
|
856
|
+
if (filled_data_) {
|
857
|
+
data_[m_slot] = std::forward<O>(item);
|
858
|
+
} else {
|
859
|
+
new (&data_[m_slot]) T(std::forward<O>(item));
|
860
|
+
filled_data_ = true;
|
861
|
+
}
|
862
|
+
weights_[m_slot] = weight;
|
863
|
+
if (marks_ != nullptr) { marks_[m_slot] = mark; }
|
864
|
+
++m_;
|
865
|
+
|
866
|
+
grow_candidate_set(total_wt_r_ + weight, r_ + 1);
|
867
|
+
}
|
868
|
+
|
869
|
+
/* In the "heavy" case the new item has weight > old_tau, so would
|
870
|
+
appear to the left of items in R in a hypothetical reverse-sorted list and
|
871
|
+
might or might not be light enough be part of this round's downsampling.
|
872
|
+
[After first splitting off the R=1 case] we greatly simplify the code by
|
873
|
+
putting the new item into the H heap whether it needs to be there or not.
|
874
|
+
In other words, it might go into the heap and then come right back out,
|
875
|
+
but that should be okay because pseudo_heavy items cannot predominate
|
876
|
+
in long streams unless (max wt) / (min wt) > o(exp(N)) */
|
877
|
+
template<typename T, typename S, typename A>
|
878
|
+
template<typename O>
|
879
|
+
void var_opt_sketch<T,S,A>::update_heavy_general(O&& item, double weight, bool mark) {
|
880
|
+
if (r_ < 2 || m_ != 0 || (r_ + h_) != k_) throw std::logic_error("invalid sketch state during heavy general update");
|
881
|
+
|
882
|
+
// put into H, although may come back out momentarily
|
883
|
+
push(std::forward<O>(item), weight, mark);
|
884
|
+
|
885
|
+
grow_candidate_set(total_wt_r_, r_);
|
886
|
+
}
|
887
|
+
|
888
|
+
/* The analysis of this case is similar to that of the general heavy case.
|
889
|
+
The one small technical difference is that since R < 2, we must grab an M item
|
890
|
+
to have a valid starting point for continue_by_growing_candidate_set () */
|
891
|
+
template<typename T, typename S, typename A>
|
892
|
+
template<typename O>
|
893
|
+
void var_opt_sketch<T,S,A>::update_heavy_r_eq1(O&& item, double weight, bool mark) {
|
894
|
+
if (r_ != 1 || m_ != 0 || (r_ + h_) != k_) throw std::logic_error("invalid sketch state during heavy r=1 update");
|
895
|
+
|
896
|
+
push(std::forward<O>(item), weight, mark); // new item into H
|
897
|
+
pop_min_to_m_region(); // pop lightest back into M
|
898
|
+
|
899
|
+
// Any set of two items is downsample-able to one item,
|
900
|
+
// so the two lightest items are a valid starting point for the following
|
901
|
+
const uint32_t m_slot = k_ - 1; // array is k+1, 1 in R, so slot before is M
|
902
|
+
grow_candidate_set(weights_[m_slot] + total_wt_r_, 2);
|
903
|
+
}
|
904
|
+
|
905
|
+
/**
|
906
|
+
* Decreases sketch's value of k by 1, updating stored values as needed.
|
907
|
+
*
|
908
|
+
* <p>Subject to certain pre-conditions, decreasing k causes tau to increase. This fact is used by
|
909
|
+
* the unioning algorithm to force "marked" items out of H and into the reservoir region.</p>
|
910
|
+
*/
|
911
|
+
template<typename T, typename S, typename A>
|
912
|
+
void var_opt_sketch<T,S,A>::decrease_k_by_1() {
|
913
|
+
if (k_ <= 1) {
|
914
|
+
throw std::logic_error("Cannot decrease k below 1 in union");
|
915
|
+
}
|
916
|
+
|
917
|
+
if ((h_ == 0) && (r_ == 0)) {
|
918
|
+
// exact mode, but no data yet; this reduction is somewhat gratuitous
|
919
|
+
--k_;
|
920
|
+
} else if ((h_ > 0) && (r_ == 0)) {
|
921
|
+
// exact mode, but we have some data
|
922
|
+
--k_;
|
923
|
+
if (h_ > k_) {
|
924
|
+
transition_from_warmup();
|
925
|
+
}
|
926
|
+
} else if ((h_ > 0) && (r_ > 0)) {
|
927
|
+
// reservoir mode, but we have some exact samples.
|
928
|
+
// Our strategy will be to pull an item out of H (which we are allowed to do since it's
|
929
|
+
// still just data), reduce k, and then re-insert the item
|
930
|
+
|
931
|
+
// first, slide the R zone to the left by 1, temporarily filling the gap
|
932
|
+
const uint32_t old_gap_idx = h_;
|
933
|
+
const uint32_t old_final_r_idx = (h_ + 1 + r_) - 1;
|
934
|
+
//if (old_final_r_idx != k_) throw std::logic_error("gadget in invalid state");
|
935
|
+
|
936
|
+
swap_values(old_final_r_idx, old_gap_idx);
|
937
|
+
|
938
|
+
// now we pull an item out of H; any item is ok, but if we grab the rightmost and then
|
939
|
+
// reduce h_, the heap invariant will be preserved (and the gap will be restored), plus
|
940
|
+
// the push() of the item that will probably happen later will be cheap.
|
941
|
+
|
942
|
+
const uint32_t pulled_idx = h_ - 1;
|
943
|
+
double pulled_weight = weights_[pulled_idx];
|
944
|
+
bool pulled_mark = marks_[pulled_idx];
|
945
|
+
// will move the pulled item below; don't do antying to it here
|
946
|
+
|
947
|
+
if (pulled_mark) { --num_marks_in_h_; }
|
948
|
+
weights_[pulled_idx] = -1.0; // to make bugs easier to spot
|
949
|
+
|
950
|
+
--h_;
|
951
|
+
--k_;
|
952
|
+
--n_; // will be re-incremented with the update
|
953
|
+
|
954
|
+
update(std::move(data_[pulled_idx]), pulled_weight, pulled_mark);
|
955
|
+
} else if ((h_ == 0) && (r_ > 0)) {
|
956
|
+
// pure reservoir mode, so can simply eject a randomly chosen sample from the reservoir
|
957
|
+
if (r_ < 2) throw std::logic_error("r_ too small for pure reservoir mode");
|
958
|
+
|
959
|
+
const uint32_t r_idx_to_delete = 1 + next_int(r_); // 1 for the gap
|
960
|
+
const uint32_t rightmost_r_idx = (1 + r_) - 1;
|
961
|
+
swap_values(r_idx_to_delete, rightmost_r_idx);
|
962
|
+
weights_[rightmost_r_idx] = -1.0;
|
963
|
+
|
964
|
+
--k_;
|
965
|
+
--r_;
|
966
|
+
}
|
967
|
+
}
|
968
|
+
|
969
|
+
template<typename T, typename S, typename A>
|
970
|
+
void var_opt_sketch<T,S,A>::allocate_data_arrays(uint32_t tgt_size, bool use_marks) {
|
971
|
+
filled_data_ = false;
|
972
|
+
|
973
|
+
data_ = A().allocate(tgt_size);
|
974
|
+
weights_ = AllocDouble().allocate(tgt_size);
|
975
|
+
|
976
|
+
if (use_marks) {
|
977
|
+
marks_ = AllocBool().allocate(tgt_size);
|
978
|
+
} else {
|
979
|
+
marks_ = nullptr;
|
980
|
+
}
|
981
|
+
}
|
982
|
+
|
983
|
+
template<typename T, typename S, typename A>
|
984
|
+
void var_opt_sketch<T,S,A>::grow_data_arrays() {
|
985
|
+
const uint32_t prev_size = curr_items_alloc_;
|
986
|
+
curr_items_alloc_ = get_adjusted_size(k_, curr_items_alloc_ << rf_);
|
987
|
+
if (curr_items_alloc_ == k_) {
|
988
|
+
++curr_items_alloc_;
|
989
|
+
}
|
990
|
+
|
991
|
+
if (prev_size < curr_items_alloc_) {
|
992
|
+
filled_data_ = false;
|
993
|
+
|
994
|
+
T* tmp_data = A().allocate(curr_items_alloc_);
|
995
|
+
double* tmp_weights = AllocDouble().allocate(curr_items_alloc_);
|
996
|
+
|
997
|
+
for (uint32_t i = 0; i < prev_size; ++i) {
|
998
|
+
new (&tmp_data[i]) T(std::move(data_[i]));
|
999
|
+
A().destroy(data_ + i);
|
1000
|
+
tmp_weights[i] = weights_[i];
|
1001
|
+
}
|
1002
|
+
|
1003
|
+
A().deallocate(data_, prev_size);
|
1004
|
+
AllocDouble().deallocate(weights_, prev_size);
|
1005
|
+
|
1006
|
+
data_ = tmp_data;
|
1007
|
+
weights_ = tmp_weights;
|
1008
|
+
|
1009
|
+
if (marks_ != nullptr) {
|
1010
|
+
bool* tmp_marks = AllocBool().allocate(curr_items_alloc_);
|
1011
|
+
for (uint32_t i = 0; i < prev_size; ++i) {
|
1012
|
+
tmp_marks[i] = marks_[i];
|
1013
|
+
}
|
1014
|
+
AllocBool().deallocate(marks_, prev_size);
|
1015
|
+
marks_ = tmp_marks;
|
1016
|
+
}
|
1017
|
+
}
|
1018
|
+
}
|
1019
|
+
|
1020
|
+
template<typename T, typename S, typename A>
|
1021
|
+
void var_opt_sketch<T,S,A>::transition_from_warmup() {
|
1022
|
+
// Move the 2 lightest items from H to M
|
1023
|
+
// But the lighter really belongs in R, so update counts to reflect that
|
1024
|
+
convert_to_heap();
|
1025
|
+
pop_min_to_m_region();
|
1026
|
+
pop_min_to_m_region();
|
1027
|
+
--m_;
|
1028
|
+
++r_;
|
1029
|
+
|
1030
|
+
if (h_ != (k_ -1) || m_ != 1 || r_ != 1)
|
1031
|
+
throw std::logic_error("invalid state for transitioning from warmup");
|
1032
|
+
|
1033
|
+
// Update total weight in R and then, having grabbed the value, overwrite
|
1034
|
+
// in weight_ array to help make bugs more obvious
|
1035
|
+
total_wt_r_ = weights_[k_]; // only one item, known location
|
1036
|
+
weights_[k_] = -1.0;
|
1037
|
+
|
1038
|
+
// The two lightest items are ncessarily downsample-able to one item,
|
1039
|
+
// and are therefore a valid initial candidate set
|
1040
|
+
grow_candidate_set(weights_[k_ - 1] + total_wt_r_, 2);
|
1041
|
+
}
|
1042
|
+
|
1043
|
+
template<typename T, typename S, typename A>
|
1044
|
+
void var_opt_sketch<T,S,A>::convert_to_heap() {
|
1045
|
+
if (h_ < 2) {
|
1046
|
+
return; // nothing to do
|
1047
|
+
}
|
1048
|
+
|
1049
|
+
const uint32_t last_slot = h_ - 1;
|
1050
|
+
const int last_non_leaf = ((last_slot + 1) / 2) - 1;
|
1051
|
+
|
1052
|
+
for (int j = last_non_leaf; j >= 0; --j) {
|
1053
|
+
restore_towards_leaves(j);
|
1054
|
+
}
|
1055
|
+
|
1056
|
+
// validates heap, used for initial debugging
|
1057
|
+
//for (uint32_t j = h_ - 1; j >= 1; --j) {
|
1058
|
+
// uint32_t p = ((j + 1) / 2) - 1;
|
1059
|
+
// if (weights_[p] > weights_[j]) throw std::logic_error("invalid heap");
|
1060
|
+
//}
|
1061
|
+
}
|
1062
|
+
|
1063
|
+
template<typename T, typename S, typename A>
|
1064
|
+
void var_opt_sketch<T,S,A>::restore_towards_leaves(uint32_t slot_in) {
|
1065
|
+
const uint32_t last_slot = h_ - 1;
|
1066
|
+
if (h_ == 0 || slot_in > last_slot) throw std::logic_error("invalid heap state");
|
1067
|
+
|
1068
|
+
uint32_t slot = slot_in;
|
1069
|
+
uint32_t child = (2 * slot_in) + 1; // might be invalid, need to check
|
1070
|
+
|
1071
|
+
while (child <= last_slot) {
|
1072
|
+
uint32_t child2 = child + 1; // might also be invalid
|
1073
|
+
if ((child2 <= last_slot) && (weights_[child2] < weights_[child])) {
|
1074
|
+
// siwtch to other child if it's both valid and smaller
|
1075
|
+
child = child2;
|
1076
|
+
}
|
1077
|
+
|
1078
|
+
if (weights_[slot] <= weights_[child]) {
|
1079
|
+
// invariant holds so we're done
|
1080
|
+
break;
|
1081
|
+
}
|
1082
|
+
|
1083
|
+
// swap and continue
|
1084
|
+
swap_values(slot, child);
|
1085
|
+
|
1086
|
+
slot = child;
|
1087
|
+
child = (2 * slot) + 1; // might be invalid, checked on next loop
|
1088
|
+
}
|
1089
|
+
}
|
1090
|
+
|
1091
|
+
template<typename T, typename S, typename A>
|
1092
|
+
void var_opt_sketch<T,S,A>::restore_towards_root(uint32_t slot_in) {
|
1093
|
+
uint32_t slot = slot_in;
|
1094
|
+
uint32_t p = (((slot + 1) / 2) - 1); // valid if slot >= 1
|
1095
|
+
while ((slot > 0) && (weights_[slot] < weights_[p])) {
|
1096
|
+
swap_values(slot, p);
|
1097
|
+
slot = p;
|
1098
|
+
p = (((slot + 1) / 2) - 1); // valid if slot >= 1
|
1099
|
+
}
|
1100
|
+
}
|
1101
|
+
|
1102
|
+
template<typename T, typename S, typename A>
|
1103
|
+
template<typename O>
|
1104
|
+
void var_opt_sketch<T,S,A>::push(O&& item, double wt, bool mark) {
|
1105
|
+
if (filled_data_) {
|
1106
|
+
data_[h_] = std::forward<O>(item);
|
1107
|
+
} else {
|
1108
|
+
new (&data_[h_]) T(std::forward<O>(item));
|
1109
|
+
filled_data_ = true;
|
1110
|
+
}
|
1111
|
+
weights_[h_] = wt;
|
1112
|
+
if (marks_ != nullptr) {
|
1113
|
+
marks_[h_] = mark;
|
1114
|
+
num_marks_in_h_ += (mark ? 1 : 0);
|
1115
|
+
}
|
1116
|
+
++h_;
|
1117
|
+
|
1118
|
+
restore_towards_root(h_ - 1); // need use old h_, but want accurate h_
|
1119
|
+
}
|
1120
|
+
|
1121
|
+
template<typename T, typename S, typename A>
|
1122
|
+
void var_opt_sketch<T,S,A>::pop_min_to_m_region() {
|
1123
|
+
if (h_ == 0 || (h_ + m_ + r_ != k_ + 1))
|
1124
|
+
throw std::logic_error("invalid heap state popping min to M region");
|
1125
|
+
|
1126
|
+
if (h_ == 1) {
|
1127
|
+
// just update bookkeeping
|
1128
|
+
++m_;
|
1129
|
+
--h_;
|
1130
|
+
} else {
|
1131
|
+
// main case
|
1132
|
+
uint32_t tgt = h_ - 1; // last slot, will swap with root
|
1133
|
+
swap_values(0, tgt);
|
1134
|
+
++m_;
|
1135
|
+
--h_;
|
1136
|
+
|
1137
|
+
restore_towards_leaves(0);
|
1138
|
+
}
|
1139
|
+
|
1140
|
+
if (is_marked(h_)) {
|
1141
|
+
--num_marks_in_h_;
|
1142
|
+
}
|
1143
|
+
}
|
1144
|
+
|
1145
|
+
|
1146
|
+
template<typename T, typename S, typename A>
|
1147
|
+
void var_opt_sketch<T,S,A>::swap_values(uint32_t src, uint32_t dst) {
|
1148
|
+
std::swap(data_[src], data_[dst]);
|
1149
|
+
std::swap(weights_[src], weights_[dst]);
|
1150
|
+
|
1151
|
+
if (marks_ != nullptr) {
|
1152
|
+
std::swap(marks_[src], marks_[dst]);
|
1153
|
+
}
|
1154
|
+
}
|
1155
|
+
|
1156
|
+
/* When entering here we should be in a well-characterized state where the
|
1157
|
+
new item has been placed in either h or m and we have a valid but not necessarily
|
1158
|
+
maximal sampling plan figured out. The array is completely full at this point.
|
1159
|
+
Everyone in h and m has an explicit weight. The candidates are right-justified
|
1160
|
+
and are either just the r set or the r set + exactly one m item. The number
|
1161
|
+
of cands is at least 2. We will now grow the candidate set as much as possible
|
1162
|
+
by pulling sufficiently light items from h to m.
|
1163
|
+
*/
|
1164
|
+
template<typename T, typename S, typename A>
|
1165
|
+
void var_opt_sketch<T,S,A>::grow_candidate_set(double wt_cands, uint32_t num_cands) {
|
1166
|
+
if ((h_ + m_ + r_ != k_ + 1) || (num_cands < 1) || (num_cands != m_ + r_) || (m_ >= 2))
|
1167
|
+
throw std::logic_error("invariant violated when growing candidate set");
|
1168
|
+
|
1169
|
+
while (h_ > 0) {
|
1170
|
+
const double next_wt = peek_min();
|
1171
|
+
const double next_tot_wt = wt_cands + next_wt;
|
1172
|
+
|
1173
|
+
// test for strict lightness of next prospect (denominator multiplied through)
|
1174
|
+
// ideally: (next_wt * (next_num_cands-1) < next_tot_wt)
|
1175
|
+
// but can use num_cands directly
|
1176
|
+
if ((next_wt * num_cands) < next_tot_wt) {
|
1177
|
+
wt_cands = next_tot_wt;
|
1178
|
+
++num_cands;
|
1179
|
+
pop_min_to_m_region(); // adjusts h_ and m_
|
1180
|
+
} else {
|
1181
|
+
break;
|
1182
|
+
}
|
1183
|
+
}
|
1184
|
+
|
1185
|
+
downsample_candidate_set(wt_cands, num_cands);
|
1186
|
+
}
|
1187
|
+
|
1188
|
+
template<typename T, typename S, typename A>
|
1189
|
+
void var_opt_sketch<T,S,A>::downsample_candidate_set(double wt_cands, uint32_t num_cands) {
|
1190
|
+
if (num_cands < 2 || h_ + num_cands != k_ + 1)
|
1191
|
+
throw std::logic_error("invalid num_cands when downsampling");
|
1192
|
+
|
1193
|
+
// need this before overwriting anything
|
1194
|
+
const uint32_t delete_slot = choose_delete_slot(wt_cands, num_cands);
|
1195
|
+
const uint32_t leftmost_cand_slot = h_;
|
1196
|
+
if (delete_slot < leftmost_cand_slot || delete_slot > k_)
|
1197
|
+
throw std::logic_error("invalid delete slot index when downsampling");
|
1198
|
+
|
1199
|
+
// Overwrite weights for items from M moving into R,
|
1200
|
+
// to make bugs more obvious. Also needed so anyone reading the
|
1201
|
+
// weight knows if it's invalid without checking h_ and m_
|
1202
|
+
const uint32_t stop_idx = leftmost_cand_slot + m_;
|
1203
|
+
for (uint32_t j = leftmost_cand_slot; j < stop_idx; ++j) {
|
1204
|
+
weights_[j] = -1.0;
|
1205
|
+
}
|
1206
|
+
|
1207
|
+
// The next two lines work even when delete_slot == leftmost_cand_slot
|
1208
|
+
data_[delete_slot] = std::move(data_[leftmost_cand_slot]);
|
1209
|
+
// cannot set data_[leftmost_cand_slot] to null since not uisng T*
|
1210
|
+
|
1211
|
+
m_ = 0;
|
1212
|
+
r_ = num_cands - 1;
|
1213
|
+
total_wt_r_ = wt_cands;
|
1214
|
+
}
|
1215
|
+
|
1216
|
+
template<typename T, typename S, typename A>
|
1217
|
+
uint32_t var_opt_sketch<T,S,A>::choose_delete_slot(double wt_cands, uint32_t num_cands) const {
|
1218
|
+
if (r_ == 0) throw std::logic_error("choosing delete slot while in exact mode");
|
1219
|
+
|
1220
|
+
if (m_ == 0) {
|
1221
|
+
// this happens if we insert a really heavy item
|
1222
|
+
return pick_random_slot_in_r();
|
1223
|
+
} else if (m_ == 1) {
|
1224
|
+
// check if we keep th item in M or pick oen from R
|
1225
|
+
// p(keep) = (num_cand - 1) * wt_M / wt_cand
|
1226
|
+
double wt_m_cand = weights_[h_]; // slot of item in M is h_
|
1227
|
+
if ((wt_cands * next_double_exclude_zero()) < ((num_cands - 1) * wt_m_cand)) {
|
1228
|
+
return pick_random_slot_in_r(); // keep item in M
|
1229
|
+
} else {
|
1230
|
+
return h_; // indext of item in M
|
1231
|
+
}
|
1232
|
+
} else {
|
1233
|
+
// general case
|
1234
|
+
const uint32_t delete_slot = choose_weighted_delete_slot(wt_cands, num_cands);
|
1235
|
+
const uint32_t first_r_slot = h_ + m_;
|
1236
|
+
if (delete_slot == first_r_slot) {
|
1237
|
+
return pick_random_slot_in_r();
|
1238
|
+
} else {
|
1239
|
+
return delete_slot;
|
1240
|
+
}
|
1241
|
+
}
|
1242
|
+
}
|
1243
|
+
|
1244
|
+
template<typename T, typename S, typename A>
|
1245
|
+
uint32_t var_opt_sketch<T,S,A>::choose_weighted_delete_slot(double wt_cands, uint32_t num_cands) const {
|
1246
|
+
if (m_ < 1) throw std::logic_error("must have weighted delete slot");
|
1247
|
+
|
1248
|
+
const uint32_t offset = h_;
|
1249
|
+
const uint32_t final_m = (offset + m_) - 1;
|
1250
|
+
const uint32_t num_to_keep = num_cands - 1;
|
1251
|
+
|
1252
|
+
double left_subtotal = 0.0;
|
1253
|
+
double right_subtotal = -1.0 * wt_cands * next_double_exclude_zero();
|
1254
|
+
|
1255
|
+
for (uint32_t i = offset; i <= final_m; ++i) {
|
1256
|
+
left_subtotal += num_to_keep * weights_[i];
|
1257
|
+
right_subtotal += wt_cands;
|
1258
|
+
|
1259
|
+
if (left_subtotal < right_subtotal) {
|
1260
|
+
return i;
|
1261
|
+
}
|
1262
|
+
}
|
1263
|
+
|
1264
|
+
// this slot tells caller that we need to delete out of R
|
1265
|
+
return final_m + 1;
|
1266
|
+
}
|
1267
|
+
|
1268
|
+
template<typename T, typename S, typename A>
|
1269
|
+
uint32_t var_opt_sketch<T,S,A>::pick_random_slot_in_r() const {
|
1270
|
+
if (r_ == 0) throw std::logic_error("r_ = 0 when picking slot in R region");
|
1271
|
+
const uint32_t offset = h_ + m_;
|
1272
|
+
if (r_ == 1) {
|
1273
|
+
return offset;
|
1274
|
+
} else {
|
1275
|
+
return offset + next_int(r_);
|
1276
|
+
}
|
1277
|
+
}
|
1278
|
+
|
1279
|
+
template<typename T, typename S, typename A>
|
1280
|
+
double var_opt_sketch<T,S,A>::peek_min() const {
|
1281
|
+
if (h_ == 0) throw std::logic_error("h_ = 0 when checking min in H region");
|
1282
|
+
return weights_[0];
|
1283
|
+
}
|
1284
|
+
|
1285
|
+
template<typename T, typename S, typename A>
|
1286
|
+
inline bool var_opt_sketch<T,S,A>::is_marked(uint32_t idx) const {
|
1287
|
+
return marks_ == nullptr ? false : marks_[idx];
|
1288
|
+
}
|
1289
|
+
|
1290
|
+
template<typename T, typename S, typename A>
|
1291
|
+
double var_opt_sketch<T,S,A>::get_tau() const {
|
1292
|
+
return r_ == 0 ? std::nan("1") : (total_wt_r_ / r_);
|
1293
|
+
}
|
1294
|
+
|
1295
|
+
template<typename T, typename S, typename A>
|
1296
|
+
void var_opt_sketch<T,S,A>::strip_marks() {
|
1297
|
+
if (marks_ == nullptr) throw std::logic_error("request to strip marks from non-gadget");
|
1298
|
+
num_marks_in_h_ = 0;
|
1299
|
+
AllocBool().deallocate(marks_, curr_items_alloc_);
|
1300
|
+
marks_ = nullptr;
|
1301
|
+
}
|
1302
|
+
|
1303
|
+
template<typename T, typename S, typename A>
|
1304
|
+
void var_opt_sketch<T,S,A>::check_preamble_longs(uint8_t preamble_longs, uint8_t flags) {
|
1305
|
+
const bool is_empty(flags & EMPTY_FLAG_MASK);
|
1306
|
+
|
1307
|
+
if (is_empty) {
|
1308
|
+
if (preamble_longs != PREAMBLE_LONGS_EMPTY) {
|
1309
|
+
throw std::invalid_argument("Possible corruption: Preamble longs must be "
|
1310
|
+
+ std::to_string(PREAMBLE_LONGS_EMPTY) + " for an empty sketch. Found: "
|
1311
|
+
+ std::to_string(preamble_longs));
|
1312
|
+
}
|
1313
|
+
} else {
|
1314
|
+
if (preamble_longs != PREAMBLE_LONGS_WARMUP
|
1315
|
+
&& preamble_longs != PREAMBLE_LONGS_FULL) {
|
1316
|
+
throw std::invalid_argument("Possible corruption: Preamble longs must be "
|
1317
|
+
+ std::to_string(PREAMBLE_LONGS_WARMUP) + " or "
|
1318
|
+
+ std::to_string(PREAMBLE_LONGS_FULL)
|
1319
|
+
+ " for a non-empty sketch. Found: " + std::to_string(preamble_longs));
|
1320
|
+
}
|
1321
|
+
}
|
1322
|
+
}
|
1323
|
+
|
1324
|
+
template<typename T, typename S, typename A>
|
1325
|
+
void var_opt_sketch<T,S,A>::check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver) {
|
1326
|
+
if (family_id == FAMILY_ID) {
|
1327
|
+
if (ser_ver != SER_VER) {
|
1328
|
+
throw std::invalid_argument("Possible corruption: VarOpt serialization version must be "
|
1329
|
+
+ std::to_string(SER_VER) + ". Found: " + std::to_string(ser_ver));
|
1330
|
+
}
|
1331
|
+
return;
|
1332
|
+
}
|
1333
|
+
// TODO: extend to handle reservoir sampling
|
1334
|
+
|
1335
|
+
throw std::invalid_argument("Possible corruption: VarOpt family id must be "
|
1336
|
+
+ std::to_string(FAMILY_ID) + ". Found: " + std::to_string(family_id));
|
1337
|
+
}
|
1338
|
+
|
1339
|
+
template<typename T, typename S, typename A>
|
1340
|
+
uint32_t var_opt_sketch<T, S, A>::validate_and_get_target_size(uint32_t preamble_longs, uint32_t k, uint64_t n,
|
1341
|
+
uint32_t h, uint32_t r, resize_factor rf) {
|
1342
|
+
if (k == 0 || k > MAX_K) {
|
1343
|
+
throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
|
1344
|
+
}
|
1345
|
+
|
1346
|
+
uint32_t array_size;
|
1347
|
+
|
1348
|
+
if (n <= k) {
|
1349
|
+
if (preamble_longs != PREAMBLE_LONGS_WARMUP) {
|
1350
|
+
throw std::invalid_argument("Possible corruption: deserializing with n <= k but not in warmup mode. "
|
1351
|
+
"Found n = " + std::to_string(n) + ", k = " + std::to_string(k));
|
1352
|
+
}
|
1353
|
+
if (n != h) {
|
1354
|
+
throw std::invalid_argument("Possible corruption: deserializing in warmup mode but n != h. "
|
1355
|
+
"Found n = " + std::to_string(n) + ", h = " + std::to_string(h));
|
1356
|
+
}
|
1357
|
+
if (r > 0) {
|
1358
|
+
throw std::invalid_argument("Possible corruption: deserializing in warmup mode but r > 0. "
|
1359
|
+
"Found r = " + std::to_string(r));
|
1360
|
+
}
|
1361
|
+
|
1362
|
+
const uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k));
|
1363
|
+
const uint32_t min_lg_size = to_log_2(ceiling_power_of_2(h));
|
1364
|
+
const uint32_t initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf, min_lg_size);
|
1365
|
+
array_size = get_adjusted_size(k, 1 << initial_lg_size);
|
1366
|
+
if (array_size == k) { // if full size, need to leave 1 for the gap
|
1367
|
+
++array_size;
|
1368
|
+
}
|
1369
|
+
} else { // n > k
|
1370
|
+
if (preamble_longs != PREAMBLE_LONGS_FULL) {
|
1371
|
+
throw std::invalid_argument("Possible corruption: deserializing with n > k but not in full mode. "
|
1372
|
+
"Found n = " + std::to_string(n) + ", k = " + std::to_string(k));
|
1373
|
+
}
|
1374
|
+
if (h + r != k) {
|
1375
|
+
throw std::invalid_argument("Possible corruption: deserializing in full mode but h + r != n. "
|
1376
|
+
"Found h = " + std::to_string(h) + ", r = " + std::to_string(r) + ", n = " + std::to_string(n));
|
1377
|
+
}
|
1378
|
+
|
1379
|
+
array_size = k + 1;
|
1380
|
+
}
|
1381
|
+
|
1382
|
+
return array_size;
|
1383
|
+
}
|
1384
|
+
|
1385
|
+
template<typename T, typename S, typename A>
|
1386
|
+
template<typename P>
|
1387
|
+
subset_summary var_opt_sketch<T, S, A>::estimate_subset_sum(P predicate) const {
|
1388
|
+
if (n_ == 0) {
|
1389
|
+
return {0.0, 0.0, 0.0, 0.0};
|
1390
|
+
}
|
1391
|
+
|
1392
|
+
double total_wt_h = 0.0;
|
1393
|
+
double h_true_wt = 0.0;
|
1394
|
+
size_t idx = 0;
|
1395
|
+
for (; idx < h_; ++idx) {
|
1396
|
+
double wt = weights_[idx];
|
1397
|
+
total_wt_h += wt;
|
1398
|
+
if (predicate(data_[idx])) {
|
1399
|
+
h_true_wt += wt;
|
1400
|
+
}
|
1401
|
+
}
|
1402
|
+
|
1403
|
+
// if only heavy items, we have an exact answer
|
1404
|
+
if (r_ == 0) {
|
1405
|
+
return {h_true_wt, h_true_wt, h_true_wt, h_true_wt};
|
1406
|
+
}
|
1407
|
+
|
1408
|
+
// since r_ > 0, we know we have samples
|
1409
|
+
const uint64_t num_samples = n_ - h_;
|
1410
|
+
double effective_sampling_rate = r_ / static_cast<double>(num_samples);
|
1411
|
+
if (effective_sampling_rate < 0.0 || effective_sampling_rate > 1.0)
|
1412
|
+
throw std::logic_error("invalid sampling rate outside [0.0, 1.0]");
|
1413
|
+
|
1414
|
+
size_t r_true_count = 0;
|
1415
|
+
++idx; // skip the gap
|
1416
|
+
for (; idx < (k_ + 1); ++idx) {
|
1417
|
+
if (predicate(data_[idx])) {
|
1418
|
+
++r_true_count;
|
1419
|
+
}
|
1420
|
+
}
|
1421
|
+
|
1422
|
+
double lb_true_fraction = pseudo_hypergeometric_lb_on_p(r_, r_true_count, effective_sampling_rate);
|
1423
|
+
double estimated_true_fraction = (1.0 * r_true_count) / r_;
|
1424
|
+
double ub_true_fraction = pseudo_hypergeometric_ub_on_p(r_, r_true_count, effective_sampling_rate);
|
1425
|
+
|
1426
|
+
return { h_true_wt + (total_wt_r_ * lb_true_fraction),
|
1427
|
+
h_true_wt + (total_wt_r_ * estimated_true_fraction),
|
1428
|
+
h_true_wt + (total_wt_r_ * ub_true_fraction),
|
1429
|
+
total_wt_h + total_wt_r_
|
1430
|
+
};
|
1431
|
+
}
|
1432
|
+
|
1433
|
+
template<typename T, typename S, typename A>
|
1434
|
+
class var_opt_sketch<T, S, A>::items_deleter {
|
1435
|
+
public:
|
1436
|
+
items_deleter(uint32_t num) : num(num), h_count(0), r_count(0) {}
|
1437
|
+
void set_h(uint32_t h) { h_count = h; }
|
1438
|
+
void set_r(uint32_t r) { r_count = r; }
|
1439
|
+
void operator() (T* ptr) const {
|
1440
|
+
if (h_count > 0) {
|
1441
|
+
for (size_t i = 0; i < h_count; ++i) {
|
1442
|
+
ptr[i].~T();
|
1443
|
+
}
|
1444
|
+
}
|
1445
|
+
if (r_count > 0) {
|
1446
|
+
uint32_t end = h_count + r_count + 1;
|
1447
|
+
for (size_t i = h_count + 1; i < end; ++i) {
|
1448
|
+
ptr[i].~T();
|
1449
|
+
}
|
1450
|
+
}
|
1451
|
+
if (ptr != nullptr) {
|
1452
|
+
A().deallocate(ptr, num);
|
1453
|
+
}
|
1454
|
+
}
|
1455
|
+
private:
|
1456
|
+
uint32_t num;
|
1457
|
+
uint32_t h_count;
|
1458
|
+
uint32_t r_count;
|
1459
|
+
};
|
1460
|
+
|
1461
|
+
template<typename T, typename S, typename A>
|
1462
|
+
class var_opt_sketch<T, S, A>::weights_deleter {
|
1463
|
+
public:
|
1464
|
+
weights_deleter(uint32_t num) : num(num) {}
|
1465
|
+
void operator() (double* ptr) const {
|
1466
|
+
if (ptr != nullptr) {
|
1467
|
+
AllocDouble().deallocate(ptr, num);
|
1468
|
+
}
|
1469
|
+
}
|
1470
|
+
private:
|
1471
|
+
uint32_t num;
|
1472
|
+
};
|
1473
|
+
|
1474
|
+
template<typename T, typename S, typename A>
|
1475
|
+
class var_opt_sketch<T, S, A>::marks_deleter {
|
1476
|
+
public:
|
1477
|
+
marks_deleter(uint32_t num) : num(num) {}
|
1478
|
+
void operator() (bool* ptr) const {
|
1479
|
+
if (ptr != nullptr) {
|
1480
|
+
AllocBool().deallocate(ptr, 1);
|
1481
|
+
}
|
1482
|
+
}
|
1483
|
+
private:
|
1484
|
+
uint32_t num;
|
1485
|
+
};
|
1486
|
+
|
1487
|
+
|
1488
|
+
template<typename T, typename S, typename A>
|
1489
|
+
typename var_opt_sketch<T, S, A>::const_iterator var_opt_sketch<T, S, A>::begin() const {
|
1490
|
+
return var_opt_sketch<T, S, A>::const_iterator(*this, false);
|
1491
|
+
}
|
1492
|
+
|
1493
|
+
template<typename T, typename S, typename A>
|
1494
|
+
typename var_opt_sketch<T, S, A>::const_iterator var_opt_sketch<T, S, A>::end() const {
|
1495
|
+
return var_opt_sketch<T, S, A>::const_iterator(*this, true);
|
1496
|
+
}
|
1497
|
+
|
1498
|
+
// -------- var_opt_sketch::const_iterator implementation ---------
|
1499
|
+
|
1500
|
+
template<typename T, typename S, typename A>
|
1501
|
+
var_opt_sketch<T,S,A>::const_iterator::const_iterator(const var_opt_sketch<T,S,A>& sk, bool is_end) :
|
1502
|
+
sk_(&sk),
|
1503
|
+
cum_r_weight_(0.0),
|
1504
|
+
r_item_wt_(sk.get_tau()),
|
1505
|
+
final_idx_(sk.r_ > 0 ? sk.h_ + sk.r_ + 1 : sk.h_)
|
1506
|
+
{
|
1507
|
+
// index logic easier to read if not inline
|
1508
|
+
if (is_end) {
|
1509
|
+
idx_ = final_idx_;
|
1510
|
+
sk_ = nullptr;
|
1511
|
+
} else {
|
1512
|
+
idx_ = (sk.h_ == 0 && sk.r_ > 0 ? 1 : 0); // skip if gap is at start
|
1513
|
+
}
|
1514
|
+
|
1515
|
+
// should only apply if sketch is empty
|
1516
|
+
if (idx_ == final_idx_) { sk_ = nullptr; }
|
1517
|
+
}
|
1518
|
+
|
1519
|
+
template<typename T, typename S, typename A>
|
1520
|
+
var_opt_sketch<T,S,A>::const_iterator::const_iterator(const var_opt_sketch<T,S,A>& sk, bool is_end, bool use_r_region) :
|
1521
|
+
sk_(&sk),
|
1522
|
+
cum_r_weight_(0.0),
|
1523
|
+
r_item_wt_(sk.get_tau()),
|
1524
|
+
final_idx_(sk.h_ + (use_r_region ? 1 + sk.r_ : 0))
|
1525
|
+
{
|
1526
|
+
if (use_r_region) {
|
1527
|
+
idx_ = sk.h_ + 1 + (is_end ? sk.r_ : 0);
|
1528
|
+
} else { // H region
|
1529
|
+
// gap at start only if h_ == 0, so index always starts at 0
|
1530
|
+
idx_ = (is_end ? sk.h_ : 0);
|
1531
|
+
}
|
1532
|
+
|
1533
|
+
// unlike in full iterator case, may happen even if sketch is not empty
|
1534
|
+
if (idx_ == final_idx_) { sk_ = nullptr; }
|
1535
|
+
}
|
1536
|
+
|
1537
|
+
|
1538
|
+
template<typename T, typename S, typename A>
|
1539
|
+
var_opt_sketch<T, S, A>::const_iterator::const_iterator(const const_iterator& other) :
|
1540
|
+
sk_(other.sk_),
|
1541
|
+
cum_r_weight_(other.cum_r_weight_),
|
1542
|
+
r_item_wt_(other.r_item_wt_),
|
1543
|
+
idx_(other.idx_),
|
1544
|
+
final_idx_(other.final_idx_)
|
1545
|
+
{}
|
1546
|
+
|
1547
|
+
template<typename T, typename S, typename A>
|
1548
|
+
typename var_opt_sketch<T, S, A>::const_iterator& var_opt_sketch<T, S, A>::const_iterator::operator++() {
|
1549
|
+
++idx_;
|
1550
|
+
|
1551
|
+
if (idx_ == final_idx_) {
|
1552
|
+
sk_ = nullptr;
|
1553
|
+
return *this;
|
1554
|
+
} else if (idx_ == sk_->h_ && sk_->r_ > 0) { // check for the gap
|
1555
|
+
++idx_;
|
1556
|
+
}
|
1557
|
+
if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
|
1558
|
+
return *this;
|
1559
|
+
}
|
1560
|
+
|
1561
|
+
template<typename T, typename S, typename A>
|
1562
|
+
typename var_opt_sketch<T, S, A>::const_iterator& var_opt_sketch<T, S, A>::const_iterator::operator++(int) {
|
1563
|
+
const_iterator tmp(*this);
|
1564
|
+
operator++();
|
1565
|
+
return tmp;
|
1566
|
+
}
|
1567
|
+
|
1568
|
+
template<typename T, typename S, typename A>
|
1569
|
+
bool var_opt_sketch<T, S, A>::const_iterator::operator==(const const_iterator& other) const {
|
1570
|
+
if (sk_ != other.sk_) return false;
|
1571
|
+
if (sk_ == nullptr) return true; // end (and we know other.sk_ is also null)
|
1572
|
+
return idx_ == other.idx_;
|
1573
|
+
}
|
1574
|
+
|
1575
|
+
template<typename T, typename S, typename A>
|
1576
|
+
bool var_opt_sketch<T, S, A>::const_iterator::operator!=(const const_iterator& other) const {
|
1577
|
+
return !operator==(other);
|
1578
|
+
}
|
1579
|
+
|
1580
|
+
template<typename T, typename S, typename A>
|
1581
|
+
const std::pair<const T&, const double> var_opt_sketch<T, S, A>::const_iterator::operator*() const {
|
1582
|
+
double wt;
|
1583
|
+
if (idx_ < sk_->h_) {
|
1584
|
+
wt = sk_->weights_[idx_];
|
1585
|
+
} else {
|
1586
|
+
wt = r_item_wt_;
|
1587
|
+
}
|
1588
|
+
return std::pair<const T&, const double>(sk_->data_[idx_], wt);
|
1589
|
+
}
|
1590
|
+
|
1591
|
+
template<typename T, typename S, typename A>
|
1592
|
+
bool var_opt_sketch<T, S, A>::const_iterator::get_mark() const {
|
1593
|
+
return sk_->marks_ == nullptr ? false : sk_->marks_[idx_];
|
1594
|
+
}
|
1595
|
+
|
1596
|
+
|
1597
|
+
// -------- var_opt_sketch::iterator implementation ---------
|
1598
|
+
|
1599
|
+
template<typename T, typename S, typename A>
|
1600
|
+
var_opt_sketch<T,S,A>::iterator::iterator(const var_opt_sketch<T,S,A>& sk, bool is_end, bool use_r_region) :
|
1601
|
+
sk_(&sk),
|
1602
|
+
cum_r_weight_(0.0),
|
1603
|
+
r_item_wt_(sk.get_tau()),
|
1604
|
+
final_idx_(sk.h_ + (use_r_region ? 1 + sk.r_ : 0))
|
1605
|
+
{
|
1606
|
+
if (use_r_region) {
|
1607
|
+
idx_ = sk.h_ + 1 + (is_end ? sk.r_ : 0);
|
1608
|
+
} else { // H region
|
1609
|
+
// gap at start only if h_ == 0, so index always starts at 0
|
1610
|
+
idx_ = (is_end ? sk.h_ : 0);
|
1611
|
+
}
|
1612
|
+
|
1613
|
+
// unlike in full iterator case, may happen even if sketch is not empty
|
1614
|
+
if (idx_ == final_idx_) { sk_ = nullptr; }
|
1615
|
+
}
|
1616
|
+
|
1617
|
+
template<typename T, typename S, typename A>
|
1618
|
+
var_opt_sketch<T, S, A>::iterator::iterator(const iterator& other) :
|
1619
|
+
sk_(other.sk_),
|
1620
|
+
cum_r_weight_(other.cum_r_weight_),
|
1621
|
+
r_item_wt_(other.r_item_wt_),
|
1622
|
+
idx_(other.idx_),
|
1623
|
+
final_idx_(other.final_idx_)
|
1624
|
+
{}
|
1625
|
+
|
1626
|
+
template<typename T, typename S, typename A>
|
1627
|
+
typename var_opt_sketch<T, S, A>::iterator& var_opt_sketch<T, S, A>::iterator::operator++() {
|
1628
|
+
++idx_;
|
1629
|
+
|
1630
|
+
if (idx_ == final_idx_) {
|
1631
|
+
sk_ = nullptr;
|
1632
|
+
return *this;
|
1633
|
+
} else if (idx_ == sk_->h_ && sk_->r_ > 0) { // check for the gap
|
1634
|
+
++idx_;
|
1635
|
+
}
|
1636
|
+
if (idx_ > sk_->h_) { cum_r_weight_ += r_item_wt_; }
|
1637
|
+
return *this;
|
1638
|
+
}
|
1639
|
+
|
1640
|
+
template<typename T, typename S, typename A>
|
1641
|
+
typename var_opt_sketch<T, S, A>::iterator& var_opt_sketch<T, S, A>::iterator::operator++(int) {
|
1642
|
+
const_iterator tmp(*this);
|
1643
|
+
operator++();
|
1644
|
+
return tmp;
|
1645
|
+
}
|
1646
|
+
|
1647
|
+
template<typename T, typename S, typename A>
|
1648
|
+
bool var_opt_sketch<T, S, A>::iterator::operator==(const iterator& other) const {
|
1649
|
+
if (sk_ != other.sk_) return false;
|
1650
|
+
if (sk_ == nullptr) return true; // end (and we know other.sk_ is also null)
|
1651
|
+
return idx_ == other.idx_;
|
1652
|
+
}
|
1653
|
+
|
1654
|
+
template<typename T, typename S, typename A>
|
1655
|
+
bool var_opt_sketch<T, S, A>::iterator::operator!=(const iterator& other) const {
|
1656
|
+
return !operator==(other);
|
1657
|
+
}
|
1658
|
+
|
1659
|
+
template<typename T, typename S, typename A>
|
1660
|
+
std::pair<T&, double> var_opt_sketch<T, S, A>::iterator::operator*() {
|
1661
|
+
double wt;
|
1662
|
+
if (idx_ < sk_->h_) {
|
1663
|
+
wt = sk_->weights_[idx_];
|
1664
|
+
} else if (idx_ == final_idx_ - 1) {
|
1665
|
+
wt = sk_->total_wt_r_ - cum_r_weight_;
|
1666
|
+
} else {
|
1667
|
+
wt = r_item_wt_;
|
1668
|
+
}
|
1669
|
+
return std::pair<T&, double>(sk_->data_[idx_], wt);
|
1670
|
+
}
|
1671
|
+
|
1672
|
+
template<typename T, typename S, typename A>
|
1673
|
+
bool var_opt_sketch<T, S, A>::iterator::get_mark() const {
|
1674
|
+
return sk_->marks_ == nullptr ? false : sk_->marks_[idx_];
|
1675
|
+
}
|
1676
|
+
|
1677
|
+
|
1678
|
+
|
1679
|
+
// ******************** MOVE TO COMMON UTILS AREA EVENTUALLY *********************
|
1680
|
+
|
1681
|
+
// Random source used by the sketch helpers: a 64-bit Mersenne Twister seeded
// once from std::random_device, plus a uniform distribution built with the
// bounds 0.0 and 1.0.
namespace random_utils {
  // seed source; possibly unsafe in MinGW with GCC < 9.2
  static std::random_device rd;
  static std::mt19937_64 rand{rd()};
  static std::uniform_real_distribution<> next_double{0.0, 1.0};
}
|
1686
|
+
|
1687
|
+
/**
|
1688
|
+
* Checks if target sampling allocation is more than 50% of max sampling size.
|
1689
|
+
* If so, returns max sampling size, otherwise passes through target size.
|
1690
|
+
*/
|
1691
|
+
template<typename T, typename S, typename A>
|
1692
|
+
uint32_t var_opt_sketch<T,S,A>::get_adjusted_size(uint32_t max_size, uint32_t resize_target) {
|
1693
|
+
if (max_size - (resize_target << 1) < 0L) {
|
1694
|
+
return max_size;
|
1695
|
+
}
|
1696
|
+
return resize_target;
|
1697
|
+
}
|
1698
|
+
|
1699
|
+
template<typename T, typename S, typename A>
|
1700
|
+
uint32_t var_opt_sketch<T,S,A>::starting_sub_multiple(uint32_t lg_target, uint32_t lg_rf, uint32_t lg_min) {
|
1701
|
+
return (lg_target <= lg_min)
|
1702
|
+
? lg_min : (lg_rf == 0) ? lg_target
|
1703
|
+
: (lg_target - lg_min) % lg_rf + lg_min;
|
1704
|
+
}
|
1705
|
+
|
1706
|
+
template<typename T, typename S, typename A>
|
1707
|
+
double var_opt_sketch<T,S,A>::pseudo_hypergeometric_ub_on_p(uint64_t n, uint32_t k, double sampling_rate) {
|
1708
|
+
const double adjusted_kappa = DEFAULT_KAPPA * sqrt(1 - sampling_rate);
|
1709
|
+
return bounds_binomial_proportions::approximate_upper_bound_on_p(n, k, adjusted_kappa);
|
1710
|
+
}
|
1711
|
+
|
1712
|
+
template<typename T, typename S, typename A>
|
1713
|
+
double var_opt_sketch<T,S,A>::pseudo_hypergeometric_lb_on_p(uint64_t n, uint32_t k, double sampling_rate) {
|
1714
|
+
const double adjusted_kappa = DEFAULT_KAPPA * sqrt(1 - sampling_rate);
|
1715
|
+
return bounds_binomial_proportions::approximate_lower_bound_on_p(n, k, adjusted_kappa);
|
1716
|
+
}
|
1717
|
+
|
1718
|
+
template<typename T, typename S, typename A>
|
1719
|
+
bool var_opt_sketch<T,S,A>::is_power_of_2(uint32_t v) {
|
1720
|
+
return v && !(v & (v - 1));
|
1721
|
+
}
|
1722
|
+
|
1723
|
+
template<typename T, typename S, typename A>
|
1724
|
+
uint32_t var_opt_sketch<T,S,A>::to_log_2(uint32_t v) {
|
1725
|
+
if (is_power_of_2(v)) {
|
1726
|
+
return count_trailing_zeros_in_u32(v);
|
1727
|
+
} else {
|
1728
|
+
throw std::invalid_argument("Attempt to compute integer log2 of non-positive or non-power of 2");
|
1729
|
+
}
|
1730
|
+
}
|
1731
|
+
|
1732
|
+
// Returns an integer in the range [0, max_value) -- excludes max_value
|
1733
|
+
template<typename T, typename S, typename A>
|
1734
|
+
uint32_t var_opt_sketch<T,S,A>::next_int(uint32_t max_value) {
|
1735
|
+
std::uniform_int_distribution<uint32_t> dist(0, max_value - 1);
|
1736
|
+
return dist(random_utils::rand);
|
1737
|
+
}
|
1738
|
+
|
1739
|
+
template<typename T, typename S, typename A>
|
1740
|
+
double var_opt_sketch<T,S,A>::next_double_exclude_zero() {
|
1741
|
+
double r = random_utils::next_double(random_utils::rand);
|
1742
|
+
while (r == 0.0) {
|
1743
|
+
r = random_utils::next_double(random_utils::rand);
|
1744
|
+
}
|
1745
|
+
return r;
|
1746
|
+
}
|
1747
|
+
|
1748
|
+
}
|
1749
|
+
|
1750
|
+
// namespace datasketches
|
1751
|
+
|
1752
|
+
#endif // _VAR_OPT_SKETCH_IMPL_HPP_
|