datasketches 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
18
|
+
add_library(theta INTERFACE)
|
|
19
|
+
|
|
20
|
+
add_library(${PROJECT_NAME}::THETA ALIAS theta)
|
|
21
|
+
|
|
22
|
+
if (BUILD_TESTS)
|
|
23
|
+
add_subdirectory(test)
|
|
24
|
+
endif()
|
|
25
|
+
|
|
26
|
+
target_include_directories(theta
|
|
27
|
+
INTERFACE
|
|
28
|
+
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
|
|
29
|
+
$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
target_link_libraries(theta INTERFACE common)
|
|
33
|
+
target_compile_features(theta INTERFACE cxx_std_11)
|
|
34
|
+
|
|
35
|
+
set(theta_HEADERS "")
|
|
36
|
+
list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/theta_union.hpp;include/theta_intersection.hpp")
|
|
37
|
+
list(APPEND theta_HEADERS "include/theta_a_not_b.hpp;include/theta_sketch_impl.hpp")
|
|
38
|
+
list(APPEND theta_HEADERS "include/theta_union_impl.hpp;include/theta_intersection_impl.hpp;include/theta_a_not_b_impl.hpp")
|
|
39
|
+
|
|
40
|
+
install(TARGETS theta
|
|
41
|
+
EXPORT ${PROJECT_NAME}
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
install(FILES ${theta_HEADERS}
|
|
45
|
+
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
|
|
46
|
+
|
|
47
|
+
target_sources(theta
|
|
48
|
+
INTERFACE
|
|
49
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch.hpp
|
|
50
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union.hpp
|
|
51
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection.hpp
|
|
52
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b.hpp
|
|
53
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch_impl.hpp
|
|
54
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_impl.hpp
|
|
55
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_impl.hpp
|
|
56
|
+
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b_impl.hpp
|
|
57
|
+
)
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef THETA_A_NOT_B_HPP_
|
|
21
|
+
#define THETA_A_NOT_B_HPP_
|
|
22
|
+
|
|
23
|
+
#include <memory>
|
|
24
|
+
#include <functional>
|
|
25
|
+
#include <climits>
|
|
26
|
+
|
|
27
|
+
#include "theta_sketch.hpp"
|
|
28
|
+
#include "common_defs.hpp"
|
|
29
|
+
|
|
30
|
+
namespace datasketches {
|
|
31
|
+
|
|
32
|
+
/*
|
|
33
|
+
* author Alexander Saydakov
|
|
34
|
+
* author Lee Rhodes
|
|
35
|
+
* author Kevin Lang
|
|
36
|
+
*/
|
|
37
|
+
|
|
38
|
+
template<typename A>
|
|
39
|
+
class theta_a_not_b_alloc {
|
|
40
|
+
public:
|
|
41
|
+
/**
|
|
42
|
+
* Creates an instance of the a-not-b operation (set difference) with a given hash seed.
|
|
43
|
+
* @param seed hash seed
|
|
44
|
+
*/
|
|
45
|
+
explicit theta_a_not_b_alloc(uint64_t seed = DEFAULT_SEED);
|
|
46
|
+
|
|
47
|
+
/**
|
|
48
|
+
* Computes the a-not-b set operation given two sketches.
|
|
49
|
+
* @return the result of a-not-b
|
|
50
|
+
*/
|
|
51
|
+
compact_theta_sketch_alloc<A> compute(const theta_sketch_alloc<A>& a, const theta_sketch_alloc<A>& b, bool ordered = true) const;
|
|
52
|
+
|
|
53
|
+
private:
|
|
54
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
|
|
55
|
+
uint16_t seed_hash_;
|
|
56
|
+
|
|
57
|
+
class less_than {
|
|
58
|
+
public:
|
|
59
|
+
explicit less_than(uint64_t value): value(value) {}
|
|
60
|
+
bool operator()(uint64_t value) const { return value < this->value; }
|
|
61
|
+
private:
|
|
62
|
+
uint64_t value;
|
|
63
|
+
};
|
|
64
|
+
};
|
|
65
|
+
|
|
66
|
+
// alias with default allocator for convenience
|
|
67
|
+
typedef theta_a_not_b_alloc<std::allocator<void>> theta_a_not_b;
|
|
68
|
+
|
|
69
|
+
} /* namespace datasketches */
|
|
70
|
+
|
|
71
|
+
#include "theta_a_not_b_impl.hpp"
|
|
72
|
+
|
|
73
|
+
# endif
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef THETA_A_NOT_B_IMPL_HPP_
|
|
21
|
+
#define THETA_A_NOT_B_IMPL_HPP_
|
|
22
|
+
|
|
23
|
+
#include <algorithm>
|
|
24
|
+
|
|
25
|
+
#include "conditional_back_inserter.hpp"
|
|
26
|
+
|
|
27
|
+
namespace datasketches {
|
|
28
|
+
|
|
29
|
+
/*
|
|
30
|
+
* author Alexander Saydakov
|
|
31
|
+
* author Lee Rhodes
|
|
32
|
+
* author Kevin Lang
|
|
33
|
+
*/
|
|
34
|
+
|
|
35
|
+
template<typename A>
|
|
36
|
+
theta_a_not_b_alloc<A>::theta_a_not_b_alloc(uint64_t seed):
|
|
37
|
+
seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
|
|
38
|
+
{}
|
|
39
|
+
|
|
40
|
+
template<typename A>
|
|
41
|
+
compact_theta_sketch_alloc<A> theta_a_not_b_alloc<A>::compute(const theta_sketch_alloc<A>& a, const theta_sketch_alloc<A>& b, bool ordered) const {
|
|
42
|
+
if (a.is_empty() || a.get_num_retained() == 0 || b.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
|
|
43
|
+
if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
|
|
44
|
+
if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
|
|
45
|
+
|
|
46
|
+
const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
|
|
47
|
+
vector_u64<A> keys;
|
|
48
|
+
bool is_empty = a.is_empty();
|
|
49
|
+
|
|
50
|
+
if (b.get_num_retained() == 0) {
|
|
51
|
+
std::copy_if(a.begin(), a.end(), std::back_inserter(keys), less_than(theta));
|
|
52
|
+
} else {
|
|
53
|
+
if (a.is_ordered() && b.is_ordered()) { // sort-based
|
|
54
|
+
std::set_difference(a.begin(), a.end(), b.begin(), b.end(), conditional_back_inserter(keys, less_than(theta)));
|
|
55
|
+
} else { // hash-based
|
|
56
|
+
const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
|
|
57
|
+
vector_u64<A> b_hash_table(1 << lg_size, 0);
|
|
58
|
+
for (auto key: b) {
|
|
59
|
+
if (key < theta) {
|
|
60
|
+
update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size);
|
|
61
|
+
} else if (b.is_ordered()) {
|
|
62
|
+
break; // early stop
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
// scan A lookup B
|
|
67
|
+
for (auto key: a) {
|
|
68
|
+
if (key < theta) {
|
|
69
|
+
if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys.push_back(key);
|
|
70
|
+
} else if (a.is_ordered()) {
|
|
71
|
+
break; // early stop
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
if (keys.empty() && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
|
|
77
|
+
if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
|
|
78
|
+
return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered);
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
} /* namespace datasketches */
|
|
82
|
+
|
|
83
|
+
# endif
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef THETA_INTERSECTION_HPP_
|
|
21
|
+
#define THETA_INTERSECTION_HPP_
|
|
22
|
+
|
|
23
|
+
#include <memory>
|
|
24
|
+
#include <functional>
|
|
25
|
+
#include <climits>
|
|
26
|
+
|
|
27
|
+
#include "theta_sketch.hpp"
|
|
28
|
+
#include "common_defs.hpp"
|
|
29
|
+
|
|
30
|
+
namespace datasketches {
|
|
31
|
+
|
|
32
|
+
/*
|
|
33
|
+
* author Alexander Saydakov
|
|
34
|
+
* author Lee Rhodes
|
|
35
|
+
* author Kevin Lang
|
|
36
|
+
*/
|
|
37
|
+
|
|
38
|
+
template<typename A>
|
|
39
|
+
class theta_intersection_alloc {
|
|
40
|
+
public:
|
|
41
|
+
/**
|
|
42
|
+
* Creates an instance of the intersection with a given hash seed.
|
|
43
|
+
* @param seed hash seed
|
|
44
|
+
*/
|
|
45
|
+
explicit theta_intersection_alloc(uint64_t seed = DEFAULT_SEED);
|
|
46
|
+
|
|
47
|
+
/**
|
|
48
|
+
* Updates the intersection with a given sketch.
|
|
49
|
+
* The intersection can be viewed as starting from the "universe" set, and every update
|
|
50
|
+
* can reduce the current set to leave the overlapping subset only.
|
|
51
|
+
* @param sketch represents input set for the intersection
|
|
52
|
+
*/
|
|
53
|
+
void update(const theta_sketch_alloc<A>& sketch);
|
|
54
|
+
|
|
55
|
+
/**
|
|
56
|
+
* Produces a copy of the current state of the intersection.
|
|
57
|
+
* If update() was not called, the state is the infinite "universe",
|
|
58
|
+
* which is considered an undefined state; in that case an exception is thrown.
|
|
59
|
+
* @param ordered optional flag to specify if ordered sketch should be produced
|
|
60
|
+
* @return the result of the intersection
|
|
61
|
+
*/
|
|
62
|
+
compact_theta_sketch_alloc<A> get_result(bool ordered = true) const;
|
|
63
|
+
|
|
64
|
+
/**
|
|
65
|
+
* Returns true if the state of the intersection is defined (not infinite "universe").
|
|
66
|
+
* @return true if the state is valid
|
|
67
|
+
*/
|
|
68
|
+
bool has_result() const;
|
|
69
|
+
|
|
70
|
+
private:
|
|
71
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
|
|
72
|
+
bool is_valid_;
|
|
73
|
+
bool is_empty_;
|
|
74
|
+
uint64_t theta_;
|
|
75
|
+
uint8_t lg_size_;
|
|
76
|
+
vector_u64<A> keys_;
|
|
77
|
+
uint32_t num_keys_;
|
|
78
|
+
uint16_t seed_hash_;
|
|
79
|
+
};
|
|
80
|
+
|
|
81
|
+
// alias with default allocator for convenience
|
|
82
|
+
typedef theta_intersection_alloc<std::allocator<void>> theta_intersection;
|
|
83
|
+
|
|
84
|
+
} /* namespace datasketches */
|
|
85
|
+
|
|
86
|
+
#include "theta_intersection_impl.hpp"
|
|
87
|
+
|
|
88
|
+
# endif
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef THETA_INTERSECTION_IMPL_HPP_
|
|
21
|
+
#define THETA_INTERSECTION_IMPL_HPP_
|
|
22
|
+
|
|
23
|
+
#include <algorithm>
|
|
24
|
+
|
|
25
|
+
namespace datasketches {
|
|
26
|
+
|
|
27
|
+
/*
|
|
28
|
+
* author Alexander Saydakov
|
|
29
|
+
* author Lee Rhodes
|
|
30
|
+
* author Kevin Lang
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
template<typename A>
|
|
34
|
+
theta_intersection_alloc<A>::theta_intersection_alloc(uint64_t seed):
|
|
35
|
+
is_valid_(false),
|
|
36
|
+
is_empty_(false),
|
|
37
|
+
theta_(theta_sketch_alloc<A>::MAX_THETA),
|
|
38
|
+
lg_size_(0),
|
|
39
|
+
keys_(),
|
|
40
|
+
num_keys_(0),
|
|
41
|
+
seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
|
|
42
|
+
{}
|
|
43
|
+
|
|
44
|
+
template<typename A>
|
|
45
|
+
void theta_intersection_alloc<A>::update(const theta_sketch_alloc<A>& sketch) {
|
|
46
|
+
if (is_empty_) return;
|
|
47
|
+
if (!sketch.is_empty() && sketch.get_seed_hash() != seed_hash_) throw std::invalid_argument("seed hash mismatch");
|
|
48
|
+
is_empty_ |= sketch.is_empty();
|
|
49
|
+
theta_ = std::min(theta_, sketch.get_theta64());
|
|
50
|
+
if (is_valid_ && num_keys_ == 0) return;
|
|
51
|
+
if (sketch.get_num_retained() == 0) {
|
|
52
|
+
is_valid_ = true;
|
|
53
|
+
if (keys_.size() > 0) {
|
|
54
|
+
keys_.resize(0);
|
|
55
|
+
lg_size_ = 0;
|
|
56
|
+
num_keys_ = 0;
|
|
57
|
+
}
|
|
58
|
+
return;
|
|
59
|
+
}
|
|
60
|
+
if (!is_valid_) { // first update, clone incoming sketch
|
|
61
|
+
is_valid_ = true;
|
|
62
|
+
lg_size_ = lg_size_from_count(sketch.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
|
|
63
|
+
keys_.resize(1 << lg_size_, 0);
|
|
64
|
+
for (auto key: sketch) {
|
|
65
|
+
if (!update_theta_sketch_alloc<A>::hash_search_or_insert(key, keys_.data(), lg_size_)) {
|
|
66
|
+
throw std::invalid_argument("duplicate key, possibly corrupted input sketch");
|
|
67
|
+
}
|
|
68
|
+
++num_keys_;
|
|
69
|
+
}
|
|
70
|
+
if (num_keys_ != sketch.get_num_retained()) throw std::invalid_argument("num keys mismatch, possibly corrupted input sketch");
|
|
71
|
+
} else { // intersection
|
|
72
|
+
const uint32_t max_matches = std::min(num_keys_, sketch.get_num_retained());
|
|
73
|
+
vector_u64<A> matched_keys(max_matches);
|
|
74
|
+
uint32_t match_count = 0;
|
|
75
|
+
uint32_t count = 0;
|
|
76
|
+
for (auto key: sketch) {
|
|
77
|
+
if (key < theta_) {
|
|
78
|
+
if (update_theta_sketch_alloc<A>::hash_search(key, keys_.data(), lg_size_)) {
|
|
79
|
+
if (match_count == max_matches) throw std::invalid_argument("max matches exceeded, possibly corrupted input sketch");
|
|
80
|
+
matched_keys[match_count++] = key;
|
|
81
|
+
}
|
|
82
|
+
} else if (sketch.is_ordered()) {
|
|
83
|
+
break; // early stop
|
|
84
|
+
}
|
|
85
|
+
++count;
|
|
86
|
+
}
|
|
87
|
+
if (count > sketch.get_num_retained()) {
|
|
88
|
+
throw std::invalid_argument(" more keys then expected, possibly corrupted input sketch");
|
|
89
|
+
} else if (!sketch.is_ordered() && count < sketch.get_num_retained()) {
|
|
90
|
+
throw std::invalid_argument(" fewer keys then expected, possibly corrupted input sketch");
|
|
91
|
+
}
|
|
92
|
+
if (match_count == 0) {
|
|
93
|
+
keys_.resize(0);
|
|
94
|
+
lg_size_ = 0;
|
|
95
|
+
num_keys_ = 0;
|
|
96
|
+
if (theta_ == theta_sketch_alloc<A>::MAX_THETA) is_empty_ = true;
|
|
97
|
+
} else {
|
|
98
|
+
const uint8_t lg_size = lg_size_from_count(match_count, update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
|
|
99
|
+
if (lg_size != lg_size_) {
|
|
100
|
+
lg_size_ = lg_size;
|
|
101
|
+
keys_.resize(1 << lg_size_);
|
|
102
|
+
}
|
|
103
|
+
std::fill(keys_.begin(), keys_.end(), 0);
|
|
104
|
+
for (uint32_t i = 0; i < match_count; i++) {
|
|
105
|
+
update_theta_sketch_alloc<A>::hash_search_or_insert(matched_keys[i], keys_.data(), lg_size_);
|
|
106
|
+
}
|
|
107
|
+
num_keys_ = match_count;
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
template<typename A>
|
|
113
|
+
compact_theta_sketch_alloc<A> theta_intersection_alloc<A>::get_result(bool ordered) const {
|
|
114
|
+
if (!is_valid_) throw std::invalid_argument("calling get_result() before calling update() is undefined");
|
|
115
|
+
vector_u64<A> keys(num_keys_);
|
|
116
|
+
if (num_keys_ > 0) {
|
|
117
|
+
std::copy_if(keys_.begin(), keys_.end(), keys.begin(), [](uint64_t key) { return key != 0; });
|
|
118
|
+
if (ordered) std::sort(keys.begin(), keys.end());
|
|
119
|
+
}
|
|
120
|
+
return compact_theta_sketch_alloc<A>(is_empty_, theta_, std::move(keys), seed_hash_, ordered);
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
template<typename A>
|
|
124
|
+
bool theta_intersection_alloc<A>::has_result() const {
|
|
125
|
+
return is_valid_;
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
} /* namespace datasketches */
|
|
129
|
+
|
|
130
|
+
# endif
|
|
@@ -0,0 +1,533 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef THETA_SKETCH_HPP_
|
|
21
|
+
#define THETA_SKETCH_HPP_
|
|
22
|
+
|
|
23
|
+
#include <climits>
#include <cstddef>
#include <functional>
#include <iterator>
#include <memory>
#include <vector>
|
|
27
|
+
|
|
28
|
+
#include "common_defs.hpp"
|
|
29
|
+
|
|
30
|
+
namespace datasketches {
|
|
31
|
+
|
|
32
|
+
/*
|
|
33
|
+
* author Alexander Saydakov
|
|
34
|
+
* author Lee Rhodes
|
|
35
|
+
* author Kevin Lang
|
|
36
|
+
*/
|
|
37
|
+
|
|
38
|
+
// forward-declarations
|
|
39
|
+
template<typename A> class theta_sketch_alloc;
|
|
40
|
+
template<typename A> class update_theta_sketch_alloc;
|
|
41
|
+
template<typename A> class compact_theta_sketch_alloc;
|
|
42
|
+
template<typename A> class theta_union_alloc;
|
|
43
|
+
template<typename A> class theta_intersection_alloc;
|
|
44
|
+
template<typename A> class theta_a_not_b_alloc;
|
|
45
|
+
|
|
46
|
+
// for serialization as raw bytes
|
|
47
|
+
template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
|
|
48
|
+
template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
|
|
49
|
+
|
|
50
|
+
template<typename A>
|
|
51
|
+
class theta_sketch_alloc {
|
|
52
|
+
public:
|
|
53
|
+
static const uint64_t MAX_THETA = LLONG_MAX; // signed max for compatibility with Java
|
|
54
|
+
static const uint8_t SERIAL_VERSION = 3;
|
|
55
|
+
|
|
56
|
+
virtual ~theta_sketch_alloc() = default;
|
|
57
|
+
|
|
58
|
+
/**
|
|
59
|
+
* @return true if this sketch represents an empty set (not the same as no retained entries!)
|
|
60
|
+
*/
|
|
61
|
+
bool is_empty() const;
|
|
62
|
+
|
|
63
|
+
/**
|
|
64
|
+
* @return estimate of the distinct count of the input stream
|
|
65
|
+
*/
|
|
66
|
+
double get_estimate() const;
|
|
67
|
+
|
|
68
|
+
/**
|
|
69
|
+
* Returns the approximate lower error bound given a number of standard deviations.
|
|
70
|
+
* This parameter is similar to the number of standard deviations of the normal distribution
|
|
71
|
+
* and corresponds to approximately 67%, 95% and 99% confidence intervals.
|
|
72
|
+
* @param num_std_devs number of Standard Deviations (1, 2 or 3)
|
|
73
|
+
* @return the lower bound
|
|
74
|
+
*/
|
|
75
|
+
double get_lower_bound(uint8_t num_std_devs) const;
|
|
76
|
+
|
|
77
|
+
/**
|
|
78
|
+
* Returns the approximate upper error bound given a number of standard deviations.
|
|
79
|
+
* This parameter is similar to the number of standard deviations of the normal distribution
|
|
80
|
+
* and corresponds to approximately 67%, 95% and 99% confidence intervals.
|
|
81
|
+
* @param num_std_devs number of Standard Deviations (1, 2 or 3)
|
|
82
|
+
* @return the upper bound
|
|
83
|
+
*/
|
|
84
|
+
double get_upper_bound(uint8_t num_std_devs) const;
|
|
85
|
+
|
|
86
|
+
/**
|
|
87
|
+
* @return true if the sketch is in estimation mode (as opposed to exact mode)
|
|
88
|
+
*/
|
|
89
|
+
bool is_estimation_mode() const;
|
|
90
|
+
|
|
91
|
+
/**
|
|
92
|
+
* @return theta as a fraction from 0 to 1 (effective sampling rate)
|
|
93
|
+
*/
|
|
94
|
+
double get_theta() const;
|
|
95
|
+
|
|
96
|
+
/**
|
|
97
|
+
* @return theta as a positive integer between 0 and LLONG_MAX
|
|
98
|
+
*/
|
|
99
|
+
uint64_t get_theta64() const;
|
|
100
|
+
|
|
101
|
+
/**
|
|
102
|
+
* @return the number of retained entries in the sketch
|
|
103
|
+
*/
|
|
104
|
+
virtual uint32_t get_num_retained() const = 0;
|
|
105
|
+
|
|
106
|
+
virtual uint16_t get_seed_hash() const = 0;
|
|
107
|
+
|
|
108
|
+
/**
|
|
109
|
+
* @return true if retained entries are ordered
|
|
110
|
+
*/
|
|
111
|
+
virtual bool is_ordered() const = 0;
|
|
112
|
+
|
|
113
|
+
/**
|
|
114
|
+
* Writes a human-readable summary of this sketch to a given stream
|
|
115
|
+
* @param print_items if true include the list of items retained by the sketch
|
|
116
|
+
*/
|
|
117
|
+
virtual string<A> to_string(bool print_items = false) const = 0;
|
|
118
|
+
|
|
119
|
+
/**
|
|
120
|
+
* This method serializes the sketch into a given stream in a binary form
|
|
121
|
+
* @param os output stream
|
|
122
|
+
*/
|
|
123
|
+
virtual void serialize(std::ostream& os) const = 0;
|
|
124
|
+
|
|
125
|
+
// This is a convenience alias for users
|
|
126
|
+
// The type returned by the following serialize method
|
|
127
|
+
typedef vector_u8<A> vector_bytes;
|
|
128
|
+
|
|
129
|
+
/**
|
|
130
|
+
* This method serializes the sketch as a vector of bytes.
|
|
131
|
+
* An optional header can be reserved in front of the sketch.
|
|
132
|
+
* It is an uninitialized space of a given size.
|
|
133
|
+
* This header is used in Datasketches PostgreSQL extension.
|
|
134
|
+
* @param header_size_bytes space to reserve in front of the sketch
|
|
135
|
+
*/
|
|
136
|
+
virtual vector_bytes serialize(unsigned header_size_bytes = 0) const = 0;
|
|
137
|
+
|
|
138
|
+
// This is a convenience alias for users
|
|
139
|
+
// The type returned by the following deserialize methods
|
|
140
|
+
// It is not possible to return instances of an abstract type, so this has to be a pointer
|
|
141
|
+
typedef std::unique_ptr<theta_sketch_alloc<A>, std::function<void(theta_sketch_alloc<A>*)>> unique_ptr;
|
|
142
|
+
|
|
143
|
+
/**
|
|
144
|
+
* This method deserializes a sketch from a given stream.
|
|
145
|
+
* @param is input stream
|
|
146
|
+
* @param seed the seed for the hash function that was used to create the sketch
|
|
147
|
+
* @return an instance of a sketch as a unique_ptr
|
|
148
|
+
*/
|
|
149
|
+
static unique_ptr deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
|
|
150
|
+
|
|
151
|
+
/**
|
|
152
|
+
* This method deserializes a sketch from a given array of bytes.
|
|
153
|
+
* @param bytes pointer to the array of bytes
|
|
154
|
+
* @param size the size of the array
|
|
155
|
+
* @param seed the seed for the hash function that was used to create the sketch
|
|
156
|
+
* @return an instance of the sketch
|
|
157
|
+
*/
|
|
158
|
+
static unique_ptr deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
|
|
159
|
+
|
|
160
|
+
class const_iterator;
|
|
161
|
+
|
|
162
|
+
/**
|
|
163
|
+
* Iterator over hash values in this sketch.
|
|
164
|
+
* @return begin iterator
|
|
165
|
+
*/
|
|
166
|
+
virtual const_iterator begin() const = 0;
|
|
167
|
+
|
|
168
|
+
/**
|
|
169
|
+
* Iterator pointing past the valid range.
|
|
170
|
+
* Not to be incremented or dereferenced.
|
|
171
|
+
* @return end iterator
|
|
172
|
+
*/
|
|
173
|
+
virtual const_iterator end() const = 0;
|
|
174
|
+
|
|
175
|
+
protected:
|
|
176
|
+
enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
|
|
177
|
+
|
|
178
|
+
bool is_empty_;
|
|
179
|
+
uint64_t theta_;
|
|
180
|
+
|
|
181
|
+
theta_sketch_alloc(bool is_empty, uint64_t theta);
|
|
182
|
+
|
|
183
|
+
static uint16_t get_seed_hash(uint64_t seed);
|
|
184
|
+
|
|
185
|
+
static void check_sketch_type(uint8_t actual, uint8_t expected);
|
|
186
|
+
static void check_serial_version(uint8_t actual, uint8_t expected);
|
|
187
|
+
static void check_seed_hash(uint16_t actual, uint16_t expected);
|
|
188
|
+
|
|
189
|
+
friend theta_intersection_alloc<A>;
|
|
190
|
+
friend theta_a_not_b_alloc<A>;
|
|
191
|
+
};
|
|
192
|
+
|
|
193
|
+
// update sketch
|
|
194
|
+
|
|
195
|
+
template<typename A> using AllocU64 = typename std::allocator_traits<A>::template rebind_alloc<uint64_t>;
|
|
196
|
+
template<typename A> using vector_u64 = std::vector<uint64_t, AllocU64<A>>;
|
|
197
|
+
|
|
198
|
+
template<typename A>
|
|
199
|
+
class update_theta_sketch_alloc: public theta_sketch_alloc<A> {
|
|
200
|
+
public:
|
|
201
|
+
class builder;
|
|
202
|
+
enum resize_factor { X1, X2, X4, X8 };
|
|
203
|
+
static const uint8_t SKETCH_TYPE = 2;
|
|
204
|
+
|
|
205
|
+
// No constructor here. Use builder instead.
|
|
206
|
+
|
|
207
|
+
virtual ~update_theta_sketch_alloc() = default;
|
|
208
|
+
|
|
209
|
+
virtual uint32_t get_num_retained() const;
|
|
210
|
+
virtual uint16_t get_seed_hash() const;
|
|
211
|
+
virtual bool is_ordered() const;
|
|
212
|
+
virtual string<A> to_string(bool print_items = false) const;
|
|
213
|
+
virtual void serialize(std::ostream& os) const;
|
|
214
|
+
typedef vector_u8<A> vector_bytes; // alias for users
|
|
215
|
+
// header space is reserved, but not initialized
|
|
216
|
+
virtual vector_bytes serialize(unsigned header_size_bytes = 0) const;
|
|
217
|
+
|
|
218
|
+
/**
|
|
219
|
+
* Update this sketch with a given string.
|
|
220
|
+
* @param value string to update the sketch with
|
|
221
|
+
*/
|
|
222
|
+
void update(const std::string& value);
|
|
223
|
+
|
|
224
|
+
/**
|
|
225
|
+
* Update this sketch with a given unsigned 64-bit integer.
|
|
226
|
+
* @param value uint64_t to update the sketch with
|
|
227
|
+
*/
|
|
228
|
+
void update(uint64_t value);
|
|
229
|
+
|
|
230
|
+
/**
|
|
231
|
+
* Update this sketch with a given signed 64-bit integer.
|
|
232
|
+
* @param value int64_t to update the sketch with
|
|
233
|
+
*/
|
|
234
|
+
void update(int64_t value);
|
|
235
|
+
|
|
236
|
+
/**
|
|
237
|
+
* Update this sketch with a given unsigned 32-bit integer.
|
|
238
|
+
* For compatibility with Java implementation.
|
|
239
|
+
* @param value uint32_t to update the sketch with
|
|
240
|
+
*/
|
|
241
|
+
void update(uint32_t value);
|
|
242
|
+
|
|
243
|
+
/**
|
|
244
|
+
* Update this sketch with a given signed 32-bit integer.
|
|
245
|
+
* For compatibility with Java implementation.
|
|
246
|
+
* @param value int32_t to update the sketch with
|
|
247
|
+
*/
|
|
248
|
+
void update(int32_t value);
|
|
249
|
+
|
|
250
|
+
/**
|
|
251
|
+
* Update this sketch with a given unsigned 16-bit integer.
|
|
252
|
+
* For compatibility with Java implementation.
|
|
253
|
+
* @param value uint16_t to update the sketch with
|
|
254
|
+
*/
|
|
255
|
+
void update(uint16_t value);
|
|
256
|
+
|
|
257
|
+
/**
|
|
258
|
+
* Update this sketch with a given signed 16-bit integer.
|
|
259
|
+
* For compatibility with Java implementation.
|
|
260
|
+
* @param value int16_t to update the sketch with
|
|
261
|
+
*/
|
|
262
|
+
void update(int16_t value);
|
|
263
|
+
|
|
264
|
+
/**
|
|
265
|
+
* Update this sketch with a given unsigned 8-bit integer.
|
|
266
|
+
* For compatibility with Java implementation.
|
|
267
|
+
* @param value uint8_t to update the sketch with
|
|
268
|
+
*/
|
|
269
|
+
void update(uint8_t value);
|
|
270
|
+
|
|
271
|
+
/**
|
|
272
|
+
* Update this sketch with a given signed 8-bit integer.
|
|
273
|
+
* For compatibility with Java implementation.
|
|
274
|
+
* @param value int8_t to update the sketch with
|
|
275
|
+
*/
|
|
276
|
+
void update(int8_t value);
|
|
277
|
+
|
|
278
|
+
/**
|
|
279
|
+
* Update this sketch with a given double-precision floating point value.
|
|
280
|
+
* For compatibility with Java implementation.
|
|
281
|
+
* @param value double to update the sketch with
|
|
282
|
+
*/
|
|
283
|
+
void update(double value);
|
|
284
|
+
|
|
285
|
+
/**
|
|
286
|
+
* Update this sketch with a given floating point value.
|
|
287
|
+
* For compatibility with Java implementation.
|
|
288
|
+
* @param value float to update the sketch with
|
|
289
|
+
*/
|
|
290
|
+
void update(float value);
|
|
291
|
+
|
|
292
|
+
/**
|
|
293
|
+
* Update this sketch with given data of any type.
|
|
294
|
+
* This is a "universal" update that covers all cases above,
|
|
295
|
+
* but may produce different hashes.
|
|
296
|
+
* Be very careful to hash input values consistently using the same approach
|
|
297
|
+
* both over time and on different platforms
|
|
298
|
+
* and while passing sketches between C++ environment and Java environment.
|
|
299
|
+
* Otherwise two sketches that should represent overlapping sets will be disjoint
|
|
300
|
+
* For instance, for signed 32-bit values call update(int32_t) method above,
|
|
301
|
+
* which does widening conversion to int64_t, if compatibility with Java is expected
|
|
302
|
+
* @param data pointer to the data
|
|
303
|
+
* @param length of the data in bytes
|
|
304
|
+
*/
|
|
305
|
+
void update(const void* data, unsigned length);
|
|
306
|
+
|
|
307
|
+
/**
|
|
308
|
+
* Remove retained entries in excess of the nominal size k (if any)
|
|
309
|
+
*/
|
|
310
|
+
void trim();
|
|
311
|
+
|
|
312
|
+
/**
|
|
313
|
+
* Converts this sketch to a compact sketch (ordered or unordered).
|
|
314
|
+
* @param ordered optional flag to specify if ordered sketch should be produced
|
|
315
|
+
* @return compact sketch
|
|
316
|
+
*/
|
|
317
|
+
compact_theta_sketch_alloc<A> compact(bool ordered = true) const;
|
|
318
|
+
|
|
319
|
+
virtual typename theta_sketch_alloc<A>::const_iterator begin() const;
|
|
320
|
+
virtual typename theta_sketch_alloc<A>::const_iterator end() const;
|
|
321
|
+
|
|
322
|
+
/**
|
|
323
|
+
* This method deserializes a sketch from a given stream.
|
|
324
|
+
* @param is input stream
|
|
325
|
+
* @param seed the seed for the hash function that was used to create the sketch
|
|
326
|
+
* @return an instance of a sketch
|
|
327
|
+
*/
|
|
328
|
+
static update_theta_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
|
|
329
|
+
|
|
330
|
+
/**
|
|
331
|
+
* This method deserializes a sketch from a given array of bytes.
|
|
332
|
+
* @param bytes pointer to the array of bytes
|
|
333
|
+
* @param size the size of the array
|
|
334
|
+
* @param seed the seed for the hash function that was used to create the sketch
|
|
335
|
+
* @return an instance of the sketch
|
|
336
|
+
*/
|
|
337
|
+
static update_theta_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
|
|
338
|
+
|
|
339
|
+
private:
|
|
340
|
+
// resize threshold = 0.5 tuned for speed
|
|
341
|
+
static constexpr double RESIZE_THRESHOLD = 0.5;
|
|
342
|
+
// hash table rebuild threshold = 15/16
|
|
343
|
+
static constexpr double REBUILD_THRESHOLD = 15.0 / 16.0;
|
|
344
|
+
|
|
345
|
+
static constexpr uint8_t STRIDE_HASH_BITS = 7;
|
|
346
|
+
static constexpr uint32_t STRIDE_MASK = (1 << STRIDE_HASH_BITS) - 1;
|
|
347
|
+
|
|
348
|
+
uint8_t lg_cur_size_;
|
|
349
|
+
uint8_t lg_nom_size_;
|
|
350
|
+
vector_u64<A> keys_;
|
|
351
|
+
uint32_t num_keys_;
|
|
352
|
+
resize_factor rf_;
|
|
353
|
+
float p_;
|
|
354
|
+
uint64_t seed_;
|
|
355
|
+
uint32_t capacity_;
|
|
356
|
+
|
|
357
|
+
// for builder
|
|
358
|
+
update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed);
|
|
359
|
+
|
|
360
|
+
// for deserialize
|
|
361
|
+
update_theta_sketch_alloc(bool is_empty, uint64_t theta, uint8_t lg_cur_size, uint8_t lg_nom_size, vector_u64<A>&& keys, uint32_t num_keys, resize_factor rf, float p, uint64_t seed);
|
|
362
|
+
|
|
363
|
+
void resize();
|
|
364
|
+
void rebuild();
|
|
365
|
+
|
|
366
|
+
friend theta_union_alloc<A>;
|
|
367
|
+
void internal_update(uint64_t hash);
|
|
368
|
+
|
|
369
|
+
friend theta_intersection_alloc<A>;
|
|
370
|
+
friend theta_a_not_b_alloc<A>;
|
|
371
|
+
static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
|
|
372
|
+
static inline uint32_t get_stride(uint64_t hash, uint8_t lg_size);
|
|
373
|
+
static bool hash_search_or_insert(uint64_t hash, uint64_t* table, uint8_t lg_size);
|
|
374
|
+
static bool hash_search(uint64_t hash, const uint64_t* table, uint8_t lg_size);
|
|
375
|
+
|
|
376
|
+
friend theta_sketch_alloc<A>;
|
|
377
|
+
static update_theta_sketch_alloc<A> internal_deserialize(std::istream& is, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed);
|
|
378
|
+
static update_theta_sketch_alloc<A> internal_deserialize(const void* bytes, size_t size, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed);
|
|
379
|
+
};
|
|
380
|
+
|
|
381
|
+
// compact sketch
|
|
382
|
+
|
|
383
|
+
template<typename A>
|
|
384
|
+
class compact_theta_sketch_alloc: public theta_sketch_alloc<A> {
|
|
385
|
+
public:
|
|
386
|
+
static const uint8_t SKETCH_TYPE = 3;
|
|
387
|
+
|
|
388
|
+
// No constructor here.
|
|
389
|
+
// Instances of this type can be obtained:
|
|
390
|
+
// - by compacting an update_theta_sketch
|
|
391
|
+
// - as a result of a set operation
|
|
392
|
+
// - by deserializing a previously serialized compact sketch
|
|
393
|
+
|
|
394
|
+
compact_theta_sketch_alloc(const theta_sketch_alloc<A>& other, bool ordered);
|
|
395
|
+
virtual ~compact_theta_sketch_alloc() = default;
|
|
396
|
+
|
|
397
|
+
virtual uint32_t get_num_retained() const;
|
|
398
|
+
virtual uint16_t get_seed_hash() const;
|
|
399
|
+
virtual bool is_ordered() const;
|
|
400
|
+
virtual string<A> to_string(bool print_items = false) const;
|
|
401
|
+
virtual void serialize(std::ostream& os) const;
|
|
402
|
+
typedef vector_u8<A> vector_bytes; // alias for users
|
|
403
|
+
// header space is reserved, but not initialized
|
|
404
|
+
virtual vector_bytes serialize(unsigned header_size_bytes = 0) const;
|
|
405
|
+
|
|
406
|
+
virtual typename theta_sketch_alloc<A>::const_iterator begin() const;
|
|
407
|
+
virtual typename theta_sketch_alloc<A>::const_iterator end() const;
|
|
408
|
+
|
|
409
|
+
/**
|
|
410
|
+
* This method deserializes a sketch from a given stream.
|
|
411
|
+
* @param is input stream
|
|
412
|
+
* @param seed the seed for the hash function that was used to create the sketch
|
|
413
|
+
* @return an instance of a sketch
|
|
414
|
+
*/
|
|
415
|
+
static compact_theta_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
|
|
416
|
+
|
|
417
|
+
/**
|
|
418
|
+
* This method deserializes a sketch from a given array of bytes.
|
|
419
|
+
* @param bytes pointer to the array of bytes
|
|
420
|
+
* @param size the size of the array
|
|
421
|
+
* @param seed the seed for the hash function that was used to create the sketch
|
|
422
|
+
* @return an instance of the sketch
|
|
423
|
+
*/
|
|
424
|
+
static compact_theta_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
|
|
425
|
+
|
|
426
|
+
private:
|
|
427
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
|
|
428
|
+
|
|
429
|
+
vector_u64<A> keys_;
|
|
430
|
+
uint16_t seed_hash_;
|
|
431
|
+
bool is_ordered_;
|
|
432
|
+
|
|
433
|
+
friend theta_sketch_alloc<A>;
|
|
434
|
+
friend update_theta_sketch_alloc<A>;
|
|
435
|
+
friend theta_union_alloc<A>;
|
|
436
|
+
friend theta_intersection_alloc<A>;
|
|
437
|
+
friend theta_a_not_b_alloc<A>;
|
|
438
|
+
compact_theta_sketch_alloc(bool is_empty, uint64_t theta, vector_u64<A>&& keys, uint16_t seed_hash, bool is_ordered);
|
|
439
|
+
static compact_theta_sketch_alloc<A> internal_deserialize(std::istream& is, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash);
|
|
440
|
+
static compact_theta_sketch_alloc<A> internal_deserialize(const void* bytes, size_t size, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash);
|
|
441
|
+
};
|
|
442
|
+
|
|
443
|
+
// builder
|
|
444
|
+
|
|
445
|
+
template<typename A>
|
|
446
|
+
class update_theta_sketch_alloc<A>::builder {
|
|
447
|
+
public:
|
|
448
|
+
static const uint8_t MIN_LG_K = 5;
|
|
449
|
+
static const uint8_t DEFAULT_LG_K = 12;
|
|
450
|
+
static const resize_factor DEFAULT_RESIZE_FACTOR = X8;
|
|
451
|
+
|
|
452
|
+
/**
|
|
453
|
+
* Creates and instance of the builder with default parameters.
|
|
454
|
+
*/
|
|
455
|
+
builder();
|
|
456
|
+
|
|
457
|
+
/**
|
|
458
|
+
* Set log2(k), where k is a nominal number of entries in the sketch
|
|
459
|
+
* @param lg_k base 2 logarithm of nominal number of entries
|
|
460
|
+
* @return this builder
|
|
461
|
+
*/
|
|
462
|
+
builder& set_lg_k(uint8_t lg_k);
|
|
463
|
+
|
|
464
|
+
/**
|
|
465
|
+
* Set resize factor for the internal hash table (defaults to 8)
|
|
466
|
+
* @param rf resize factor
|
|
467
|
+
* @return this builder
|
|
468
|
+
*/
|
|
469
|
+
builder& set_resize_factor(resize_factor rf);
|
|
470
|
+
|
|
471
|
+
/**
|
|
472
|
+
* Set sampling probability (initial theta). The default is 1, so the sketch retains
|
|
473
|
+
* all entries until it reaches the limit, at which point it goes into the estimation mode
|
|
474
|
+
* and reduces the effective sampling probability (theta) as necessary.
|
|
475
|
+
* @param p sampling probability
|
|
476
|
+
* @return this builder
|
|
477
|
+
*/
|
|
478
|
+
builder& set_p(float p);
|
|
479
|
+
|
|
480
|
+
/**
|
|
481
|
+
* Set the seed for the hash function. Should be used carefully if needed.
|
|
482
|
+
* Sketches produced with different seed are not compatible
|
|
483
|
+
* and cannot be mixed in set operations.
|
|
484
|
+
* @param seed hash seed
|
|
485
|
+
* @return this builder
|
|
486
|
+
*/
|
|
487
|
+
builder& set_seed(uint64_t seed);
|
|
488
|
+
|
|
489
|
+
/**
|
|
490
|
+
* This is to create an instance of the sketch with predefined parameters.
|
|
491
|
+
* @return and instance of the sketch
|
|
492
|
+
*/
|
|
493
|
+
update_theta_sketch_alloc<A> build() const;
|
|
494
|
+
|
|
495
|
+
private:
|
|
496
|
+
uint8_t lg_k_;
|
|
497
|
+
resize_factor rf_;
|
|
498
|
+
float p_;
|
|
499
|
+
uint64_t seed_;
|
|
500
|
+
|
|
501
|
+
static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
|
|
502
|
+
};
|
|
503
|
+
|
|
504
|
+
// iterator
|
|
505
|
+
template<typename A>
|
|
506
|
+
class theta_sketch_alloc<A>::const_iterator: public std::iterator<std::input_iterator_tag, uint64_t> {
|
|
507
|
+
public:
|
|
508
|
+
const_iterator& operator++();
|
|
509
|
+
const_iterator operator++(int);
|
|
510
|
+
bool operator==(const const_iterator& other) const;
|
|
511
|
+
bool operator!=(const const_iterator& other) const;
|
|
512
|
+
uint64_t operator*() const;
|
|
513
|
+
|
|
514
|
+
private:
|
|
515
|
+
const uint64_t* keys_;
|
|
516
|
+
uint32_t size_;
|
|
517
|
+
uint32_t index_;
|
|
518
|
+
const_iterator(const uint64_t* keys, uint32_t size, uint32_t index);
|
|
519
|
+
friend class update_theta_sketch_alloc<A>;
|
|
520
|
+
friend class compact_theta_sketch_alloc<A>;
|
|
521
|
+
};
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
// aliases with default allocator for convenience
|
|
525
|
+
typedef theta_sketch_alloc<std::allocator<void>> theta_sketch;
|
|
526
|
+
typedef update_theta_sketch_alloc<std::allocator<void>> update_theta_sketch;
|
|
527
|
+
typedef compact_theta_sketch_alloc<std::allocator<void>> compact_theta_sketch;
|
|
528
|
+
|
|
529
|
+
} /* namespace datasketches */
|
|
530
|
+
|
|
531
|
+
#include "theta_sketch_impl.hpp"
|
|
532
|
+
|
|
533
|
+
#endif
|