datasketches 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
namespace datasketches {
|
|
21
|
+
|
|
22
|
+
template<typename A>
|
|
23
|
+
update_array_of_doubles_sketch_alloc<A>::update_array_of_doubles_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
|
|
24
|
+
uint64_t theta, uint64_t seed, const array_of_doubles_update_policy<A>& policy, const A& allocator):
|
|
25
|
+
Base(lg_cur_size, lg_nom_size, rf, theta, seed, policy, allocator) {}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
template<typename A>
|
|
29
|
+
uint8_t update_array_of_doubles_sketch_alloc<A>::get_num_values() const {
|
|
30
|
+
return this->policy_.get_num_values();
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
template<typename A>
|
|
34
|
+
compact_array_of_doubles_sketch_alloc<A> update_array_of_doubles_sketch_alloc<A>::compact(bool ordered) const {
|
|
35
|
+
return compact_array_of_doubles_sketch_alloc<A>(*this, ordered);
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
// builder
|
|
39
|
+
|
|
40
|
+
template<typename A>
|
|
41
|
+
update_array_of_doubles_sketch_alloc<A>::builder::builder(const array_of_doubles_update_policy<A>& policy, const A& allocator):
|
|
42
|
+
tuple_base_builder<builder, array_of_doubles_update_policy<A>, A>(policy, allocator) {}
|
|
43
|
+
|
|
44
|
+
template<typename A>
|
|
45
|
+
update_array_of_doubles_sketch_alloc<A> update_array_of_doubles_sketch_alloc<A>::builder::build() const {
|
|
46
|
+
return update_array_of_doubles_sketch_alloc<A>(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->policy_, this->allocator_);
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
// compact sketch
|
|
50
|
+
|
|
51
|
+
template<typename A>
|
|
52
|
+
template<typename S>
|
|
53
|
+
compact_array_of_doubles_sketch_alloc<A>::compact_array_of_doubles_sketch_alloc(const S& other, bool ordered):
|
|
54
|
+
Base(other, ordered), num_values_(other.get_num_values()) {}
|
|
55
|
+
|
|
56
|
+
template<typename A>
|
|
57
|
+
compact_array_of_doubles_sketch_alloc<A>::compact_array_of_doubles_sketch_alloc(bool is_empty, bool is_ordered,
|
|
58
|
+
uint16_t seed_hash, uint64_t theta, std::vector<Entry, AllocEntry>&& entries, uint8_t num_values):
|
|
59
|
+
Base(is_empty, is_ordered, seed_hash, theta, std::move(entries)), num_values_(num_values) {}
|
|
60
|
+
|
|
61
|
+
template<typename A>
|
|
62
|
+
compact_array_of_doubles_sketch_alloc<A>::compact_array_of_doubles_sketch_alloc(uint8_t num_values, Base&& base):
|
|
63
|
+
Base(std::move(base)), num_values_(num_values) {}
|
|
64
|
+
|
|
65
|
+
template<typename A>
|
|
66
|
+
uint8_t compact_array_of_doubles_sketch_alloc<A>::get_num_values() const {
|
|
67
|
+
return num_values_;
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
template<typename A>
|
|
71
|
+
void compact_array_of_doubles_sketch_alloc<A>::serialize(std::ostream& os) const {
|
|
72
|
+
const uint8_t preamble_longs = 1;
|
|
73
|
+
os.write(reinterpret_cast<const char*>(&preamble_longs), sizeof(preamble_longs));
|
|
74
|
+
const uint8_t serial_version = SERIAL_VERSION;
|
|
75
|
+
os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
|
|
76
|
+
const uint8_t family = SKETCH_FAMILY;
|
|
77
|
+
os.write(reinterpret_cast<const char*>(&family), sizeof(family));
|
|
78
|
+
const uint8_t type = SKETCH_TYPE;
|
|
79
|
+
os.write(reinterpret_cast<const char*>(&type), sizeof(type));
|
|
80
|
+
const uint8_t flags_byte(
|
|
81
|
+
(this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
|
|
82
|
+
(this->get_num_retained() > 0 ? 1 << flags::HAS_ENTRIES : 0) |
|
|
83
|
+
(this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
|
|
84
|
+
);
|
|
85
|
+
os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
|
|
86
|
+
os.write(reinterpret_cast<const char*>(&num_values_), sizeof(num_values_));
|
|
87
|
+
const uint16_t seed_hash = this->get_seed_hash();
|
|
88
|
+
os.write(reinterpret_cast<const char*>(&seed_hash), sizeof(seed_hash));
|
|
89
|
+
os.write(reinterpret_cast<const char*>(&(this->theta_)), sizeof(uint64_t));
|
|
90
|
+
if (this->get_num_retained() > 0) {
|
|
91
|
+
const uint32_t num_entries = this->entries_.size();
|
|
92
|
+
os.write(reinterpret_cast<const char*>(&num_entries), sizeof(num_entries));
|
|
93
|
+
const uint32_t unused32 = 0;
|
|
94
|
+
os.write(reinterpret_cast<const char*>(&unused32), sizeof(unused32));
|
|
95
|
+
for (const auto& it: this->entries_) {
|
|
96
|
+
os.write(reinterpret_cast<const char*>(&it.first), sizeof(uint64_t));
|
|
97
|
+
}
|
|
98
|
+
for (const auto& it: this->entries_) {
|
|
99
|
+
os.write(reinterpret_cast<const char*>(it.second.data()), it.second.size() * sizeof(double));
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
template<typename A>
|
|
105
|
+
auto compact_array_of_doubles_sketch_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
|
|
106
|
+
const uint8_t preamble_longs = 1;
|
|
107
|
+
const size_t size = header_size_bytes + 16 // preamble and theta
|
|
108
|
+
+ (this->entries_.size() > 0 ? 8 : 0)
|
|
109
|
+
+ (sizeof(uint64_t) + sizeof(double) * num_values_) * this->entries_.size();
|
|
110
|
+
vector_bytes bytes(size, 0, this->entries_.get_allocator());
|
|
111
|
+
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
112
|
+
|
|
113
|
+
ptr += copy_to_mem(&preamble_longs, ptr, sizeof(preamble_longs));
|
|
114
|
+
const uint8_t serial_version = SERIAL_VERSION;
|
|
115
|
+
ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
|
|
116
|
+
const uint8_t family = SKETCH_FAMILY;
|
|
117
|
+
ptr += copy_to_mem(&family, ptr, sizeof(family));
|
|
118
|
+
const uint8_t type = SKETCH_TYPE;
|
|
119
|
+
ptr += copy_to_mem(&type, ptr, sizeof(type));
|
|
120
|
+
const uint8_t flags_byte(
|
|
121
|
+
(this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
|
|
122
|
+
(this->get_num_retained() ? 1 << flags::HAS_ENTRIES : 0) |
|
|
123
|
+
(this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
|
|
124
|
+
);
|
|
125
|
+
ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
|
|
126
|
+
ptr += copy_to_mem(&num_values_, ptr, sizeof(num_values_));
|
|
127
|
+
const uint16_t seed_hash = this->get_seed_hash();
|
|
128
|
+
ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
|
|
129
|
+
ptr += copy_to_mem(&(this->theta_), ptr, sizeof(uint64_t));
|
|
130
|
+
if (this->get_num_retained() > 0) {
|
|
131
|
+
const uint32_t num_entries = this->entries_.size();
|
|
132
|
+
ptr += copy_to_mem(&num_entries, ptr, sizeof(num_entries));
|
|
133
|
+
const uint32_t unused32 = 0;
|
|
134
|
+
ptr += copy_to_mem(&unused32, ptr, sizeof(unused32));
|
|
135
|
+
for (const auto& it: this->entries_) {
|
|
136
|
+
ptr += copy_to_mem(&it.first, ptr, sizeof(uint64_t));
|
|
137
|
+
}
|
|
138
|
+
for (const auto& it: this->entries_) {
|
|
139
|
+
ptr += copy_to_mem(it.second.data(), ptr, it.second.size() * sizeof(double));
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
return bytes;
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
template<typename A>
|
|
146
|
+
compact_array_of_doubles_sketch_alloc<A> compact_array_of_doubles_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
|
|
147
|
+
uint8_t preamble_longs;
|
|
148
|
+
is.read(reinterpret_cast<char*>(&preamble_longs), sizeof(preamble_longs));
|
|
149
|
+
uint8_t serial_version;
|
|
150
|
+
is.read(reinterpret_cast<char*>(&serial_version), sizeof(serial_version));
|
|
151
|
+
uint8_t family;
|
|
152
|
+
is.read(reinterpret_cast<char*>(&family), sizeof(family));
|
|
153
|
+
uint8_t type;
|
|
154
|
+
is.read(reinterpret_cast<char*>(&type), sizeof(type));
|
|
155
|
+
uint8_t flags_byte;
|
|
156
|
+
is.read(reinterpret_cast<char*>(&flags_byte), sizeof(flags_byte));
|
|
157
|
+
uint8_t num_values;
|
|
158
|
+
is.read(reinterpret_cast<char*>(&num_values), sizeof(num_values));
|
|
159
|
+
uint16_t seed_hash;
|
|
160
|
+
is.read(reinterpret_cast<char*>(&seed_hash), sizeof(seed_hash));
|
|
161
|
+
checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
|
|
162
|
+
checker<true>::check_sketch_family(family, SKETCH_FAMILY);
|
|
163
|
+
checker<true>::check_sketch_type(type, SKETCH_TYPE);
|
|
164
|
+
const bool has_entries = flags_byte & (1 << flags::HAS_ENTRIES);
|
|
165
|
+
if (has_entries) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
|
166
|
+
|
|
167
|
+
uint64_t theta;
|
|
168
|
+
is.read(reinterpret_cast<char*>(&theta), sizeof(theta));
|
|
169
|
+
std::vector<Entry, AllocEntry> entries(allocator);
|
|
170
|
+
if (has_entries) {
|
|
171
|
+
uint32_t num_entries;
|
|
172
|
+
is.read(reinterpret_cast<char*>(&num_entries), sizeof(num_entries));
|
|
173
|
+
uint32_t unused32;
|
|
174
|
+
is.read(reinterpret_cast<char*>(&unused32), sizeof(unused32));
|
|
175
|
+
entries.reserve(num_entries);
|
|
176
|
+
std::vector<uint64_t, AllocU64> keys(num_entries, 0, allocator);
|
|
177
|
+
is.read(reinterpret_cast<char*>(keys.data()), num_entries * sizeof(uint64_t));
|
|
178
|
+
for (size_t i = 0; i < num_entries; ++i) {
|
|
179
|
+
aod<A> summary(num_values, allocator);
|
|
180
|
+
is.read(reinterpret_cast<char*>(summary.data()), num_values * sizeof(double));
|
|
181
|
+
entries.push_back(Entry(keys[i], std::move(summary)));
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
if (!is.good()) throw std::runtime_error("error reading from std::istream");
|
|
185
|
+
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
|
186
|
+
const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
|
|
187
|
+
return compact_array_of_doubles_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries), num_values);
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
template<typename A>
|
|
191
|
+
compact_array_of_doubles_sketch_alloc<A> compact_array_of_doubles_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
|
|
192
|
+
ensure_minimum_memory(size, 16);
|
|
193
|
+
const char* ptr = static_cast<const char*>(bytes);
|
|
194
|
+
uint8_t preamble_longs;
|
|
195
|
+
ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
|
|
196
|
+
uint8_t serial_version;
|
|
197
|
+
ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
|
|
198
|
+
uint8_t family;
|
|
199
|
+
ptr += copy_from_mem(ptr, &family, sizeof(family));
|
|
200
|
+
uint8_t type;
|
|
201
|
+
ptr += copy_from_mem(ptr, &type, sizeof(type));
|
|
202
|
+
uint8_t flags_byte;
|
|
203
|
+
ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
|
|
204
|
+
uint8_t num_values;
|
|
205
|
+
ptr += copy_from_mem(ptr, &num_values, sizeof(num_values));
|
|
206
|
+
uint16_t seed_hash;
|
|
207
|
+
ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
|
|
208
|
+
checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
|
|
209
|
+
checker<true>::check_sketch_family(family, SKETCH_FAMILY);
|
|
210
|
+
checker<true>::check_sketch_type(type, SKETCH_TYPE);
|
|
211
|
+
const bool has_entries = flags_byte & (1 << flags::HAS_ENTRIES);
|
|
212
|
+
if (has_entries) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
|
213
|
+
|
|
214
|
+
uint64_t theta;
|
|
215
|
+
ptr += copy_from_mem(ptr, &theta, sizeof(theta));
|
|
216
|
+
std::vector<Entry, AllocEntry> entries(allocator);
|
|
217
|
+
if (has_entries) {
|
|
218
|
+
ensure_minimum_memory(size, 24);
|
|
219
|
+
uint32_t num_entries;
|
|
220
|
+
ptr += copy_from_mem(ptr, &num_entries, sizeof(num_entries));
|
|
221
|
+
uint32_t unused32;
|
|
222
|
+
ptr += copy_from_mem(ptr, &unused32, sizeof(unused32));
|
|
223
|
+
ensure_minimum_memory(size, 24 + (sizeof(uint64_t) + sizeof(double) * num_values) * num_entries);
|
|
224
|
+
entries.reserve(num_entries);
|
|
225
|
+
std::vector<uint64_t, AllocU64> keys(num_entries, 0, allocator);
|
|
226
|
+
ptr += copy_from_mem(ptr, keys.data(), sizeof(uint64_t) * num_entries);
|
|
227
|
+
for (size_t i = 0; i < num_entries; ++i) {
|
|
228
|
+
aod<A> summary(num_values, allocator);
|
|
229
|
+
ptr += copy_from_mem(ptr, summary.data(), num_values * sizeof(double));
|
|
230
|
+
entries.push_back(Entry(keys[i], std::move(summary)));
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
|
234
|
+
const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
|
|
235
|
+
return compact_array_of_doubles_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries), num_values);
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
} /* namespace datasketches */
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef ARRAY_OF_DOUBLES_UNION_HPP_
|
|
21
|
+
#define ARRAY_OF_DOUBLES_UNION_HPP_
|
|
22
|
+
|
|
23
|
+
#include <vector>
|
|
24
|
+
#include <memory>
|
|
25
|
+
|
|
26
|
+
#include "array_of_doubles_sketch.hpp"
|
|
27
|
+
#include "tuple_union.hpp"
|
|
28
|
+
|
|
29
|
+
namespace datasketches {
|
|
30
|
+
|
|
31
|
+
template<typename A = std::allocator<double>>
|
|
32
|
+
struct array_of_doubles_union_policy_alloc {
|
|
33
|
+
array_of_doubles_union_policy_alloc(uint8_t num_values = 1): num_values_(num_values) {}
|
|
34
|
+
|
|
35
|
+
void operator()(aod<A>& summary, const aod<A>& other) const {
|
|
36
|
+
for (size_t i = 0; i < summary.size(); ++i) {
|
|
37
|
+
summary[i] += other[i];
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
uint8_t get_num_values() const {
|
|
42
|
+
return num_values_;
|
|
43
|
+
}
|
|
44
|
+
private:
|
|
45
|
+
uint8_t num_values_;
|
|
46
|
+
};
|
|
47
|
+
|
|
48
|
+
// alias with default allocator
using array_of_doubles_union_policy = array_of_doubles_union_policy_alloc<>;
|
|
49
|
+
|
|
50
|
+
template<typename Allocator = std::allocator<double>>
|
|
51
|
+
class array_of_doubles_union_alloc: public tuple_union<aod<Allocator>, array_of_doubles_union_policy_alloc<Allocator>, AllocAOD<Allocator>> {
|
|
52
|
+
public:
|
|
53
|
+
using Policy = array_of_doubles_union_policy_alloc<Allocator>;
|
|
54
|
+
using Base = tuple_union<aod<Allocator>, Policy, AllocAOD<Allocator>>;
|
|
55
|
+
using CompactSketch = compact_array_of_doubles_sketch_alloc<Allocator>;
|
|
56
|
+
using resize_factor = theta_constants::resize_factor;
|
|
57
|
+
|
|
58
|
+
class builder;
|
|
59
|
+
|
|
60
|
+
CompactSketch get_result(bool ordered = true) const;
|
|
61
|
+
|
|
62
|
+
private:
|
|
63
|
+
// for builder
|
|
64
|
+
array_of_doubles_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
|
|
65
|
+
};
|
|
66
|
+
|
|
67
|
+
template<typename Allocator>
|
|
68
|
+
class array_of_doubles_union_alloc<Allocator>::builder: public tuple_base_builder<builder, array_of_doubles_union_policy_alloc<Allocator>, Allocator> {
|
|
69
|
+
public:
|
|
70
|
+
builder(const array_of_doubles_union_policy_alloc<Allocator>& policy = array_of_doubles_union_policy_alloc<Allocator>(), const Allocator& allocator = Allocator());
|
|
71
|
+
array_of_doubles_union_alloc<Allocator> build() const;
|
|
72
|
+
};
|
|
73
|
+
|
|
74
|
+
// alias with default allocator
|
|
75
|
+
using array_of_doubles_union = array_of_doubles_union_alloc<>;
|
|
76
|
+
|
|
77
|
+
} /* namespace datasketches */
|
|
78
|
+
|
|
79
|
+
#include "array_of_doubles_union_impl.hpp"
|
|
80
|
+
|
|
81
|
+
#endif
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
namespace datasketches {
|
|
21
|
+
|
|
22
|
+
template<typename A>
|
|
23
|
+
array_of_doubles_union_alloc<A>::array_of_doubles_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const A& allocator):
|
|
24
|
+
Base(lg_cur_size, lg_nom_size, rf, theta, seed, policy, allocator)
|
|
25
|
+
{}
|
|
26
|
+
|
|
27
|
+
template<typename A>
|
|
28
|
+
auto array_of_doubles_union_alloc<A>::get_result(bool ordered) const -> CompactSketch {
|
|
29
|
+
return compact_array_of_doubles_sketch_alloc<A>(this->state_.get_policy().get_policy().get_num_values(), Base::get_result(ordered));
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
// builder
|
|
33
|
+
|
|
34
|
+
template<typename A>
|
|
35
|
+
array_of_doubles_union_alloc<A>::builder::builder(const Policy& policy, const A& allocator):
|
|
36
|
+
tuple_base_builder<builder, Policy, A>(policy, allocator) {}
|
|
37
|
+
|
|
38
|
+
template<typename A>
|
|
39
|
+
array_of_doubles_union_alloc<A> array_of_doubles_union_alloc<A>::builder::build() const {
|
|
40
|
+
return array_of_doubles_union_alloc<A>(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->policy_, this->allocator_);
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
} /* namespace datasketches */
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef BOUNDS_ON_RATIOS_IN_SAMPLED_SETS_HPP_
|
|
21
|
+
#define BOUNDS_ON_RATIOS_IN_SAMPLED_SETS_HPP_
|
|
22
|
+
|
|
23
|
+
#include <cmath>
#include <cstdint>
#include <stdexcept>
#include <string>

#include <bounds_binomial_proportions.hpp>
|
|
26
|
+
|
|
27
|
+
namespace datasketches {
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* This class is used to compute the bounds on the estimate of the ratio <i>|B| / |A|</i>, where:
|
|
31
|
+
* <ul>
|
|
32
|
+
* <li><i>|A|</i> is the unknown size of a set <i>A</i> of unique identifiers.</li>
|
|
33
|
+
* <li><i>|B|</i> is the unknown size of a subset <i>B</i> of <i>A</i>.</li>
|
|
34
|
+
* <li><i>a</i> = <i>|S<sub>A</sub>|</i> is the observed size of a sample of <i>A</i>
|
|
35
|
+
* that was obtained by Bernoulli sampling with a known inclusion probability <i>f</i>.</li>
|
|
36
|
+
* <li><i>b</i> = <i>|S<sub>A</sub> ∩ B|</i> is the observed size of a subset
|
|
37
|
+
* of <i>S<sub>A</sub></i>.</li>
|
|
38
|
+
* </ul>
|
|
39
|
+
*/
|
|
40
|
+
class bounds_on_ratios_in_sampled_sets {
|
|
41
|
+
public:
|
|
42
|
+
static constexpr double NUM_STD_DEVS = 2.0;
|
|
43
|
+
|
|
44
|
+
/**
|
|
45
|
+
* Return the approximate lower bound based on a 95% confidence interval
|
|
46
|
+
* @param a See class javadoc
|
|
47
|
+
* @param b See class javadoc
|
|
48
|
+
* @param f the inclusion probability used to produce the set with size <i>a</i> and should
|
|
49
|
+
* generally be less than 0.5. Above this value, the results not be reliable.
|
|
50
|
+
* When <i>f</i> = 1.0 this returns the estimate.
|
|
51
|
+
* @return the approximate upper bound
|
|
52
|
+
*/
|
|
53
|
+
static double lower_bound_for_b_over_a(uint64_t a, uint64_t b, double f) {
|
|
54
|
+
check_inputs(a, b, f);
|
|
55
|
+
if (a == 0) return 0.0;
|
|
56
|
+
if (f == 1.0) return static_cast<double>(b) / static_cast<double>(a);
|
|
57
|
+
return bounds_binomial_proportions::approximate_lower_bound_on_p(a, b, NUM_STD_DEVS * hacky_adjuster(f));
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
/**
|
|
61
|
+
* Return the approximate upper bound based on a 95% confidence interval
|
|
62
|
+
* @param a See class javadoc
|
|
63
|
+
* @param b See class javadoc
|
|
64
|
+
* @param f the inclusion probability used to produce the set with size <i>a</i>.
|
|
65
|
+
* @return the approximate lower bound
|
|
66
|
+
*/
|
|
67
|
+
static double upper_bound_for_b_over_a(uint64_t a, uint64_t b, double f) {
|
|
68
|
+
check_inputs(a, b, f);
|
|
69
|
+
if (a == 0) return 1.0;
|
|
70
|
+
if (f == 1.0) return static_cast<double>(b) / static_cast<double>(a);
|
|
71
|
+
return bounds_binomial_proportions::approximate_upper_bound_on_p(a, b, NUM_STD_DEVS * hacky_adjuster(f));
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
/**
|
|
75
|
+
* Return the estimate of b over a
|
|
76
|
+
* @param a See class javadoc
|
|
77
|
+
* @param b See class javadoc
|
|
78
|
+
* @return the estimate of b over a
|
|
79
|
+
*/
|
|
80
|
+
static double get_estimate_of_b_over_a(uint64_t a, uint64_t b) {
|
|
81
|
+
check_inputs(a, b, 0.3);
|
|
82
|
+
if (a == 0) return 0.5;
|
|
83
|
+
return static_cast<double>(b) / static_cast<double>(a);
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
/**
|
|
87
|
+
* Return the estimate of A. See class javadoc.
|
|
88
|
+
* @param a See class javadoc
|
|
89
|
+
* @param f the inclusion probability used to produce the set with size <i>a</i>.
|
|
90
|
+
* @return the approximate lower bound
|
|
91
|
+
*/
|
|
92
|
+
static double estimate_of_a(uint64_t a, uint64_t f) {
|
|
93
|
+
check_inputs(a, 1, f);
|
|
94
|
+
return a / f;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
/**
|
|
98
|
+
* Return the estimate of B. See class javadoc.
|
|
99
|
+
* @param b See class javadoc
|
|
100
|
+
* @param f the inclusion probability used to produce the set with size <i>b</i>.
|
|
101
|
+
* @return the approximate lower bound
|
|
102
|
+
*/
|
|
103
|
+
static double estimate_of_b(uint64_t b, double f) {
|
|
104
|
+
check_inputs(b + 1, b, f);
|
|
105
|
+
return b / f;
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
private:
|
|
109
|
+
/**
|
|
110
|
+
* This hackyAdjuster is tightly coupled with the width of the confidence interval normally
|
|
111
|
+
* specified with number of standard deviations. To simplify this interface the number of
|
|
112
|
+
* standard deviations has been fixed to 2.0, which corresponds to a confidence interval of
|
|
113
|
+
* 95%.
|
|
114
|
+
* @param f the inclusion probability used to produce the set with size <i>a</i>.
|
|
115
|
+
* @return the hacky Adjuster
|
|
116
|
+
*/
|
|
117
|
+
static double hacky_adjuster(double f) {
|
|
118
|
+
const double tmp = sqrt(1.0 - f);
|
|
119
|
+
return (f <= 0.5) ? tmp : tmp + (0.01 * (f - 0.5));
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
static void check_inputs(uint64_t a, uint64_t b, double f) {
|
|
123
|
+
if (a < b) {
|
|
124
|
+
throw std::invalid_argument("a must be >= b: a = " + std::to_string(a) + ", b = " + std::to_string(b));
|
|
125
|
+
}
|
|
126
|
+
if ((f > 1.0) || (f <= 0.0)) {
|
|
127
|
+
throw std::invalid_argument("Required: ((f <= 1.0) && (f > 0.0)): " + std::to_string(f));
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
};
|
|
132
|
+
|
|
133
|
+
} /* namespace datasketches */
|
|
134
|
+
|
|
135
|
+
# endif
|