datasketches 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
@@ -0,0 +1,229 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <catch.hpp>
|
21
|
+
|
22
|
+
#include <kll_sketch.hpp>
|
23
|
+
#include <kll_helper.hpp>
|
24
|
+
|
25
|
+
#include <assert.h>
|
26
|
+
|
27
|
+
#ifdef KLL_VALIDATION
|
28
|
+
|
29
|
+
// This is to make sure the implementation matches exactly the reference implementation in OCaml.
|
30
|
+
// Conditional compilation is used because the implementation needs a few modifications:
|
31
|
+
// - switch from random choice to deterministic
|
32
|
+
// - a few methods to expose internals of the sketch
|
33
|
+
|
34
|
+
namespace datasketches {
|
35
|
+
|
36
|
+
uint32_t kll_next_offset; // to make kll_sketch deterministic
|
37
|
+
|
38
|
+
constexpr unsigned num_tests = 114;
|
39
|
+
|
40
|
+
const int64_t correct_results[num_tests * 7] = {
|
41
|
+
0, 200, 180, 3246533, 1, 180, 1098352976109474698,
|
42
|
+
1, 200, 198, 8349603, 1, 198, 686681527497651888,
|
43
|
+
2, 200, 217, 676491, 2, 117, 495856134049157644,
|
44
|
+
3, 200, 238, 3204507, 2, 138, 44453438498725402,
|
45
|
+
4, 200, 261, 2459373, 2, 161, 719830627391926938,
|
46
|
+
5, 200, 287, 5902143, 2, 187, 389303173170515580,
|
47
|
+
6, 200, 315, 5188793, 2, 215, 985218890825795000,
|
48
|
+
7, 200, 346, 801923, 2, 246, 589362992166904413,
|
49
|
+
8, 200, 380, 2466269, 2, 280, 1081848693781775853,
|
50
|
+
9, 200, 418, 5968041, 2, 318, 533825689515788397,
|
51
|
+
10, 200, 459, 3230027, 2, 243, 937332670315558786,
|
52
|
+
11, 200, 504, 5125875, 2, 288, 1019197831515566845,
|
53
|
+
12, 200, 554, 4195571, 3, 230, 797351479150148224,
|
54
|
+
13, 200, 609, 2221181, 3, 285, 451246040374318529,
|
55
|
+
14, 200, 669, 5865503, 3, 345, 253851269470815909,
|
56
|
+
15, 200, 735, 831703, 3, 411, 491974970526372303,
|
57
|
+
16, 200, 808, 4830785, 3, 327, 1032107507126916277,
|
58
|
+
17, 200, 888, 1356257, 3, 407, 215225420986342944,
|
59
|
+
18, 200, 976, 952071, 3, 417, 600280049738270697,
|
60
|
+
19, 200, 1073, 6729833, 3, 397, 341758522977365969,
|
61
|
+
20, 200, 1180, 6017925, 3, 406, 1080227312339182949,
|
62
|
+
21, 200, 1298, 4229891, 3, 401, 1092460534756675086,
|
63
|
+
22, 200, 1427, 7264889, 4, 320, 884533400696890024,
|
64
|
+
23, 200, 1569, 5836327, 4, 462, 660575800011134382,
|
65
|
+
24, 200, 1725, 5950087, 4, 416, 669373957401387528,
|
66
|
+
25, 200, 1897, 2692555, 4, 406, 607308667566496888,
|
67
|
+
26, 200, 2086, 1512443, 4, 459, 744260340112029032,
|
68
|
+
27, 200, 2294, 2681171, 4, 434, 199120609113802485,
|
69
|
+
28, 200, 2523, 3726521, 4, 450, 570993497599288304,
|
70
|
+
29, 200, 2775, 2695247, 4, 442, 306717093329516310,
|
71
|
+
30, 200, 3052, 5751175, 5, 400, 256024589545754217,
|
72
|
+
31, 200, 3357, 1148897, 5, 514, 507276662329207479,
|
73
|
+
32, 200, 3692, 484127, 5, 457, 1082660223488175122,
|
74
|
+
33, 200, 4061, 6414559, 5, 451, 620820308918522117,
|
75
|
+
34, 200, 4467, 5587461, 5, 466, 121975084804459305,
|
76
|
+
35, 200, 4913, 1615017, 5, 483, 152986529342916376,
|
77
|
+
36, 200, 5404, 6508535, 5, 492, 858526451332425960,
|
78
|
+
37, 200, 5944, 2991657, 5, 492, 624906434274621995,
|
79
|
+
38, 200, 6538, 6736565, 6, 511, 589153542019036049,
|
80
|
+
39, 200, 7191, 1579893, 6, 507, 10255312374117907,
|
81
|
+
40, 200, 7910, 412509, 6, 538, 570863587164194186,
|
82
|
+
41, 200, 8701, 1112089, 6, 477, 553100668286355347,
|
83
|
+
42, 200, 9571, 1258813, 6, 526, 344845406406036297,
|
84
|
+
43, 200, 10528, 1980049, 6, 508, 411846569527905064,
|
85
|
+
44, 200, 11580, 2167127, 6, 520, 966876726203675488,
|
86
|
+
45, 200, 12738, 1975435, 7, 561, 724125506920592732,
|
87
|
+
46, 200, 14011, 4289627, 7, 560, 753686005174215572,
|
88
|
+
47, 200, 15412, 5384001, 7, 494, 551637841878573955,
|
89
|
+
48, 200, 16953, 2902685, 7, 560, 94602851752354802,
|
90
|
+
49, 200, 18648, 4806445, 7, 562, 597672400688514221,
|
91
|
+
50, 200, 20512, 2085, 7, 529, 417280161591969960,
|
92
|
+
51, 200, 22563, 6375939, 7, 558, 11300453985206678,
|
93
|
+
52, 200, 24819, 7837057, 7, 559, 283668599967437754,
|
94
|
+
53, 200, 27300, 6607975, 8, 561, 122183647493325363,
|
95
|
+
54, 200, 30030, 1519191, 8, 550, 1145227891427321202,
|
96
|
+
55, 200, 33033, 808061, 8, 568, 71070843834364939,
|
97
|
+
56, 200, 36336, 2653529, 8, 570, 450311772805359006,
|
98
|
+
57, 200, 39969, 2188957, 8, 561, 269670427054904115,
|
99
|
+
58, 200, 43965, 5885655, 8, 539, 1039064186324091890,
|
100
|
+
59, 200, 48361, 6185889, 8, 574, 178055275082387938,
|
101
|
+
60, 200, 53197, 208767, 9, 579, 139766040442973048,
|
102
|
+
61, 200, 58516, 2551345, 9, 569, 322655279254252950,
|
103
|
+
62, 200, 64367, 1950873, 9, 569, 101542216315768285,
|
104
|
+
63, 200, 70803, 2950429, 9, 582, 72294008568551853,
|
105
|
+
64, 200, 77883, 3993977, 9, 572, 299014330559512530,
|
106
|
+
65, 200, 85671, 428871, 9, 585, 491351721800568188,
|
107
|
+
66, 200, 94238, 6740849, 9, 577, 656204268858348899,
|
108
|
+
67, 200, 103661, 2315497, 9, 562, 829926273188300764,
|
109
|
+
68, 200, 114027, 5212835, 10, 581, 542222554617639557,
|
110
|
+
69, 200, 125429, 4213475, 10, 593, 713339189579860773,
|
111
|
+
70, 200, 137971, 2411583, 10, 592, 649651658985845357,
|
112
|
+
71, 200, 151768, 5243307, 10, 567, 1017459402785275179,
|
113
|
+
72, 200, 166944, 2468367, 10, 593, 115034451827634398,
|
114
|
+
73, 200, 183638, 2210923, 10, 583, 365735165000548572,
|
115
|
+
74, 200, 202001, 321257, 10, 591, 928479940794929153,
|
116
|
+
75, 200, 222201, 8185105, 11, 600, 780163958693677795,
|
117
|
+
76, 200, 244421, 6205349, 11, 598, 132454307780236135,
|
118
|
+
77, 200, 268863, 3165901, 11, 600, 369824066179493948,
|
119
|
+
78, 200, 295749, 2831723, 11, 595, 80968411797441666,
|
120
|
+
79, 200, 325323, 464193, 11, 594, 125773061716381917,
|
121
|
+
80, 200, 357855, 7499035, 11, 576, 994150328579932916,
|
122
|
+
81, 200, 393640, 1514479, 11, 596, 111092193875842594,
|
123
|
+
82, 200, 433004, 668493, 12, 607, 497338041653302784,
|
124
|
+
83, 200, 476304, 3174931, 12, 606, 845986926165673887,
|
125
|
+
84, 200, 523934, 914611, 12, 605, 354993119685278556,
|
126
|
+
85, 200, 576327, 7270385, 12, 602, 937679531753465428,
|
127
|
+
86, 200, 633959, 1956979, 12, 598, 659413123921208266,
|
128
|
+
87, 200, 697354, 3137635, 12, 606, 874228711599628459,
|
129
|
+
88, 200, 767089, 214923, 12, 608, 1077644643342432307,
|
130
|
+
89, 200, 843797, 3084545, 13, 612, 79317113064339979,
|
131
|
+
90, 200, 928176, 7800899, 13, 612, 357414065779796772,
|
132
|
+
91, 200, 1020993, 6717253, 13, 615, 532723577905833296,
|
133
|
+
92, 200, 1123092, 5543015, 13, 614, 508695073250223746,
|
134
|
+
93, 200, 1235401, 298785, 13, 616, 34344606952783179,
|
135
|
+
94, 200, 1358941, 4530313, 13, 607, 169924026179364121,
|
136
|
+
95, 200, 1494835, 4406457, 13, 612, 1026773494313671061,
|
137
|
+
96, 200, 1644318, 1540983, 13, 614, 423454640036650614,
|
138
|
+
97, 200, 1808749, 7999631, 14, 624, 466122870338520329,
|
139
|
+
98, 200, 1989623, 4295537, 14, 621, 609309853701283445,
|
140
|
+
99, 200, 2188585, 7379971, 14, 622, 141739898871015642,
|
141
|
+
100, 200, 2407443, 6188931, 14, 621, 22515080776738923,
|
142
|
+
101, 200, 2648187, 6701239, 14, 619, 257441864177795548,
|
143
|
+
102, 200, 2913005, 2238709, 14, 623, 867028825821064773,
|
144
|
+
103, 200, 3204305, 5371075, 14, 625, 1110615471273395112,
|
145
|
+
104, 200, 3524735, 7017341, 15, 631, 619518037415974467,
|
146
|
+
105, 200, 3877208, 323337, 15, 633, 513230912593541122,
|
147
|
+
106, 200, 4264928, 6172471, 15, 628, 885861662583325072,
|
148
|
+
107, 200, 4691420, 5653803, 15, 633, 754052473303005204,
|
149
|
+
108, 200, 5160562, 1385265, 15, 630, 294993765757975100,
|
150
|
+
109, 200, 5676618, 4350899, 15, 617, 1073144684944932303,
|
151
|
+
110, 200, 6244279, 1272235, 15, 630, 308982934296855020,
|
152
|
+
111, 200, 6868706, 1763939, 16, 638, 356231694823272867,
|
153
|
+
112, 200, 7555576, 3703411, 16, 636, 20043268926300101,
|
154
|
+
113, 200, 8311133, 6554171, 16, 637, 121111429906734123
|
155
|
+
};
|
156
|
+
|
157
|
+
// Produces n integers by repeatedly adding stride to a running value and keeping
// only the low 23 bits of the sum. The stride is asserted to be odd.
static std::unique_ptr<int[]> make_input_array(unsigned n, unsigned stride) {
  assert (kll_helper::is_odd(stride));
  // 23 bits because library items are single-precision floats at the moment
  const unsigned low_bits_mask = (1 << 23) - 1;
  std::unique_ptr<int[]> values(new int[n]);
  unsigned running = 0;
  for (unsigned idx = 0; idx < n; ++idx) {
    running = (running + stride) & low_bits_mask;
    values[idx] = running;
  }
  return values;
}
|
169
|
+
|
170
|
+
// Order-sensitive 60-bit hash of the items arr[start] .. arr[start + length - 1].
// Each item is truncated to an integer and added to the accumulator, which is then
// multiplied by a fixed odd constant, masked to 60 bits and folded with its own
// upper bits.
// The arithmetic is carried out on uint64_t so that the multiplication wraps
// modulo 2^64; doing it on int64_t would overflow a signed type, which is
// undefined behavior.
// Returns a value in the range [0, 2^60); an empty range hashes to 0.
static int64_t simple_hash_of_sub_array(const float* arr, unsigned start, unsigned length) {
  const uint64_t multiplier = 738219921; // an arbitrary odd 30-bit number
  const uint64_t mask60 = (1ULL << 60) - 1ULL;
  uint64_t accum = 0;
  for (unsigned i = start; i < start + length; i++) {
    // convert through int64_t first so the float is truncated exactly as before
    accum += (uint64_t) (int64_t) arr[i];
    accum *= multiplier;
    accum &= mask60;
    accum ^= accum >> 30;
  }
  return (int64_t) accum; // fits: never exceeds 60 bits
}
|
182
|
+
|
183
|
+
// Replays every row of correct_results: builds the input stream for the row, feeds it
// into a kll_sketch<float>, and compares the number of levels, the number of retained
// items and a hash of the retained items with the values stored in the row.
// Row layout: test index, k, n, stride, expected levels, expected retained, expected hash.
TEST_CASE("kll validation", "[kll_sketch][validation]") {
  for (unsigned i = 0; i < num_tests; i++) {
    assert (correct_results[7 * i] == i);
    const unsigned sketch_k = static_cast<unsigned>(correct_results[7 * i + 1]);
    const unsigned stream_length = static_cast<unsigned>(correct_results[7 * i + 2]);
    const unsigned step = static_cast<unsigned>(correct_results[7 * i + 3]);
    auto stream = make_input_array(stream_length, step);
    kll_sketch<float> sketch(sketch_k);
    // reset before feeding; per its declaration this offset makes kll_sketch deterministic
    kll_next_offset = 0;
    for (unsigned pos = 0; pos < stream_length; ++pos) {
      sketch.update(stream[pos]);
    }
    unsigned num_levels = sketch.get_num_levels();
    unsigned num_samples = sketch.get_num_retained();
    int64_t hashed_samples = simple_hash_of_sub_array(sketch.get_items(), sketch.get_levels()[0], num_samples);
    std::cout << i;
    REQUIRE(correct_results[7 * i + 4] == num_levels);
    REQUIRE(correct_results[7 * i + 5] == num_samples);
    if (correct_results[7 * i + 6] != hashed_samples) {
      // report the mismatch and dump the sketch before failing the test case
      std::cout << " " << (correct_results[7 * i + 6]) << " != " << hashed_samples;
      sketch.to_stream(std::cout);
      FAIL();
    } else {
      std::cout << " pass" << std::endl;
    }
  }
}
|
210
|
+
|
211
|
+
// Checks simple_hash_of_sub_array against a precomputed reference value.
// The tag is spelled "[validation]" (it used to be misspelled) so that a tag filter
// selects this case together with the main "kll validation" case.
TEST_CASE("kll validation: test hash", "[kll_sketch][validation]") {
  float array[] = { 907500, 944104, 807020, 219921, 678370, 955217, 426885 };
  // hash of the 5 items starting at index 1
  REQUIRE(simple_hash_of_sub_array(array, 1, 5) == 1141543353991880193LL);
}
|
215
|
+
|
216
|
+
// Checks the first six values produced by make_input_array for stride 3654721
// against precomputed expectations.
// The tag is spelled "[validation]" (it used to be misspelled) so that a tag filter
// selects this case together with the main "kll validation" case.
TEST_CASE("kll validation: make input array", "[kll_sketch][validation]") {
  int expected_array[6] = { 3654721, 7309442, 2575555, 6230276, 1496389, 5151110 };
  auto array(make_input_array(6, 3654721));
  REQUIRE(array[0] == expected_array[0]);
  REQUIRE(array[1] == expected_array[1]);
  REQUIRE(array[2] == expected_array[2]);
  REQUIRE(array[3] == expected_array[3]);
  REQUIRE(array[4] == expected_array[4]);
  REQUIRE(array[5] == expected_array[5]);
}
|
226
|
+
|
227
|
+
} /* namespace datasketches */
|
228
|
+
|
229
|
+
#endif
|
@@ -0,0 +1,17 @@
|
|
1
|
+
[build-system]
|
2
|
+
requires = ["wheel",
|
3
|
+
"setuptools >= 30.3.0",
|
4
|
+
"setuptools_scm",
|
5
|
+
"cmake >= 3.12"]
|
6
|
+
|
7
|
+
[tool.tox]
|
8
|
+
legacy_tox_ini = """
|
9
|
+
[tox]
|
10
|
+
envlist = py3
|
11
|
+
|
12
|
+
[testenv]
|
13
|
+
deps = pytest
|
14
|
+
numpy
|
15
|
+
changedir = python/tests
|
16
|
+
commands = pytest
|
17
|
+
"""
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
18
|
+
# TODO: Can we force python version >= 3.0?
|
19
|
+
if (MSVC)
|
20
|
+
set(PYBIND11_CPP_STANDARD /std:c++11)
|
21
|
+
else()
|
22
|
+
set(PYBIND11_CPP_STANDARD -std=c++11)
|
23
|
+
endif()
|
24
|
+
|
25
|
+
add_subdirectory(pybind11)
|
26
|
+
|
27
|
+
pybind11_add_module(python MODULE EXCLUDE_FROM_ALL SYSTEM THIN_LTO)
|
28
|
+
|
29
|
+
target_link_libraries(python
|
30
|
+
PRIVATE
|
31
|
+
common
|
32
|
+
hll
|
33
|
+
kll
|
34
|
+
cpc
|
35
|
+
fi
|
36
|
+
theta
|
37
|
+
sampling
|
38
|
+
pybind11::module
|
39
|
+
)
|
40
|
+
|
41
|
+
set_target_properties(python PROPERTIES
|
42
|
+
PREFIX ""
|
43
|
+
OUTPUT_NAME datasketches
|
44
|
+
)
|
45
|
+
|
46
|
+
# ensure we make a .so on Mac rather than .dylib
|
47
|
+
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
|
48
|
+
set_target_properties(python PROPERTIES SUFFIX ".so")
|
49
|
+
endif()
|
50
|
+
|
51
|
+
target_sources(python
|
52
|
+
PRIVATE
|
53
|
+
src/datasketches.cpp
|
54
|
+
src/hll_wrapper.cpp
|
55
|
+
src/kll_wrapper.cpp
|
56
|
+
src/cpc_wrapper.cpp
|
57
|
+
src/fi_wrapper.cpp
|
58
|
+
src/theta_wrapper.cpp
|
59
|
+
src/vo_wrapper.cpp
|
60
|
+
src/vector_of_kll.cpp
|
61
|
+
)
|
@@ -0,0 +1,78 @@
|
|
1
|
+
# Python Wrapper for Datasketches
|
2
|
+
|
3
|
+
## Installation
|
4
|
+
|
5
|
+
The release files do not include the needed python binding library ([pybind11](https://github.com/pybind/pybind11)). If building
|
6
|
+
from a release package, you must ensure that the pybind11 directory points to a local copy of pybind11.
|
7
|
+
|
8
|
+
An official pypi build is eventually planned but not yet available.
|
9
|
+
|
10
|
+
If you instead want to take a (possibly ill-advised) gamble on the current state of the master branch being useable, you can run:
|
11
|
+
```pip install git+https://github.com/apache/datasketches-cpp.git```
|
12
|
+
|
13
|
+
## Developer Instructions
|
14
|
+
|
15
|
+
### Building
|
16
|
+
|
17
|
+
When cloning the source repository, you should include the pybind11 submodule with the `--recursive` option to the clone command:
|
18
|
+
```
|
19
|
+
git clone --recursive https://github.com/apache/datasketches-cpp.git
|
20
|
+
cd datasketches-cpp
|
21
|
+
python -m pip install --upgrade pip setuptools wheel numpy
|
22
|
+
python setup.py build
|
23
|
+
```
|
24
|
+
|
25
|
+
If you cloned without `--recursive`, you can add the submodule post-checkout using `git submodule update --init --recursive`.
|
26
|
+
|
27
|
+
### Installing
|
28
|
+
|
29
|
+
Assuming you have already checked out the library and any dependent submodules, install by simply replacing the last
|
30
|
+
line of the build command with `python setup.py install`.
|
31
|
+
|
32
|
+
### Unit tests
|
33
|
+
|
34
|
+
The python tests are run with `tox`. To ensure you have all the needed packages, from the package base directory run:
|
35
|
+
```
|
36
|
+
python -m pip install --upgrade pip setuptools wheel numpy tox
|
37
|
+
tox
|
38
|
+
```
|
39
|
+
|
40
|
+
## Usage
|
41
|
+
|
42
|
+
Having installed the library, loading the Datasketches library in Python is simple: `import datasketches`.
|
43
|
+
|
44
|
+
## Available Sketch Classes
|
45
|
+
|
46
|
+
- KLL
|
47
|
+
- `kll_ints_sketch`
|
48
|
+
- `kll_floats_sketch`
|
49
|
+
- Frequent Items
|
50
|
+
- `frequent_strings_sketch`
|
51
|
+
- Error types are `frequent_items_error_type.{NO_FALSE_NEGATIVES | NO_FALSE_POSITIVES}`
|
52
|
+
- Theta
|
53
|
+
- `update_theta_sketch`
|
54
|
+
- `compact_theta_sketch` (cannot be instantiated directly)
|
55
|
+
- `theta_union`
|
56
|
+
- `theta_intersection`
|
57
|
+
- `theta_a_not_b`
|
58
|
+
- HLL
|
59
|
+
- `hll_sketch`
|
60
|
+
- `hll_union`
|
61
|
+
- Target HLL types are `tgt_hll_type.{HLL_4 | HLL_6 | HLL_8}`
|
62
|
+
- CPC
|
63
|
+
- `cpc_sketch`
|
64
|
+
- `cpc_union`
|
65
|
+
- VarOpt Sampling
|
66
|
+
- `var_opt_sketch`
|
67
|
+
- `var_opt_union`
|
68
|
+
- Vector of KLL
|
69
|
+
- `vector_of_kll_ints_sketches`
|
70
|
+
- `vector_of_kll_floats_sketches`
|
71
|
+
|
72
|
+
## Known Differences from C++
|
73
|
+
|
74
|
+
The Python API largely mirrors the C++ API, with a few minor exceptions: The primary known differences are that Python on modern platforms does not support unsigned integer values or numeric values with fewer than 64 bits. As a result, you may not be able to produce identical sketches from within Python as you can with Java and C++. Loading those sketches after they have been serialized from another language will work as expected.
|
75
|
+
|
76
|
+
The Vector of KLL object is currently exclusive to python, and holds an array of independent KLL sketches. This is useful for creating a set of KLL sketches over a vector and has been designed to allow input as either a vector or a matrix of multiple vectors.
|
77
|
+
|
78
|
+
We have also removed reliance on a builder class for theta sketches as Python allows named arguments to the constructor, not strictly positional arguments.
|
@@ -0,0 +1,345 @@
|
|
1
|
+
{
|
2
|
+
"cells": [
|
3
|
+
{
|
4
|
+
"cell_type": "markdown",
|
5
|
+
"metadata": {},
|
6
|
+
"source": [
|
7
|
+
"## CPC Sketch Examples"
|
8
|
+
]
|
9
|
+
},
|
10
|
+
{
|
11
|
+
"cell_type": "markdown",
|
12
|
+
"metadata": {},
|
13
|
+
"source": [
|
14
|
+
"### Basic Sketch Usage"
|
15
|
+
]
|
16
|
+
},
|
17
|
+
{
|
18
|
+
"cell_type": "code",
|
19
|
+
"execution_count": 1,
|
20
|
+
"metadata": {},
|
21
|
+
"outputs": [],
|
22
|
+
"source": [
|
23
|
+
"from datasketches import cpc_sketch, cpc_union"
|
24
|
+
]
|
25
|
+
},
|
26
|
+
{
|
27
|
+
"cell_type": "markdown",
|
28
|
+
"metadata": {},
|
29
|
+
"source": [
|
30
|
+
"We'll create a sketch with log2(k) = 12"
|
31
|
+
]
|
32
|
+
},
|
33
|
+
{
|
34
|
+
"cell_type": "code",
|
35
|
+
"execution_count": 2,
|
36
|
+
"metadata": {},
|
37
|
+
"outputs": [],
|
38
|
+
"source": [
|
39
|
+
"sk = cpc_sketch(12)"
|
40
|
+
]
|
41
|
+
},
|
42
|
+
{
|
43
|
+
"cell_type": "markdown",
|
44
|
+
"metadata": {},
|
45
|
+
"source": [
|
46
|
+
"Insert ~2 million points. Values are hashed, so using sequential integers is fine for demonstration purposes."
|
47
|
+
]
|
48
|
+
},
|
49
|
+
{
|
50
|
+
"cell_type": "code",
|
51
|
+
"execution_count": 3,
|
52
|
+
"metadata": {},
|
53
|
+
"outputs": [
|
54
|
+
{
|
55
|
+
"name": "stdout",
|
56
|
+
"output_type": "stream",
|
57
|
+
"text": [
|
58
|
+
"### CPC sketch summary:\n",
|
59
|
+
" lgK : 12\n",
|
60
|
+
" seed hash : 93cc\n",
|
61
|
+
" C : 38212\n",
|
62
|
+
" flavor : 4\n",
|
63
|
+
" merged : false\n",
|
64
|
+
" compressed : false\n",
|
65
|
+
" intresting col : 5\n",
|
66
|
+
" HIP estimate : 2.09721e+06\n",
|
67
|
+
" kxp : 11.4725\n",
|
68
|
+
" offset : 6\n",
|
69
|
+
" table : allocated\n",
|
70
|
+
" num SV : 135\n",
|
71
|
+
" window : allocated\n",
|
72
|
+
"### End sketch summary\n",
|
73
|
+
"\n"
|
74
|
+
]
|
75
|
+
}
|
76
|
+
],
|
77
|
+
"source": [
|
78
|
+
"n = 1 << 21\n",
|
79
|
+
"for i in range(0, n):\n",
|
80
|
+
" sk.update(i)\n",
|
81
|
+
"print(sk)"
|
82
|
+
]
|
83
|
+
},
|
84
|
+
{
|
85
|
+
"cell_type": "markdown",
|
86
|
+
"metadata": {},
|
87
|
+
"source": [
|
88
|
+
"Since we know the exact value of n we can look at the estimate and upper/lower bounds as a % of the true value. We'll look at the bounds at 1 standard deviation. In this case, the true value does lie within the bounds, but since these are probabilistic bounds the true value will sometimes be outside them (especially at 1 standard deviation)."
|
89
|
+
]
|
90
|
+
},
|
91
|
+
{
|
92
|
+
"cell_type": "code",
|
93
|
+
"execution_count": 4,
|
94
|
+
"metadata": {},
|
95
|
+
"outputs": [
|
96
|
+
{
|
97
|
+
"name": "stdout",
|
98
|
+
"output_type": "stream",
|
99
|
+
"text": [
|
100
|
+
"Upper bound (1 std. dev) as % of true value: 100.9281\n"
|
101
|
+
]
|
102
|
+
}
|
103
|
+
],
|
104
|
+
"source": [
|
105
|
+
"print(\"Upper bound (1 std. dev) as % of true value: \", round(100*sk.get_upper_bound(1) / n, 4))"
|
106
|
+
]
|
107
|
+
},
|
108
|
+
{
|
109
|
+
"cell_type": "code",
|
110
|
+
"execution_count": 5,
|
111
|
+
"metadata": {},
|
112
|
+
"outputs": [
|
113
|
+
{
|
114
|
+
"name": "stdout",
|
115
|
+
"output_type": "stream",
|
116
|
+
"text": [
|
117
|
+
"Estimate as % of true value: 100.0026\n"
|
118
|
+
]
|
119
|
+
}
|
120
|
+
],
|
121
|
+
"source": [
|
122
|
+
"print(\"Estimate as % of true value: \", round(100*sk.get_estimate() / n, 4))"
|
123
|
+
]
|
124
|
+
},
|
125
|
+
{
|
126
|
+
"cell_type": "code",
|
127
|
+
"execution_count": 6,
|
128
|
+
"metadata": {},
|
129
|
+
"outputs": [
|
130
|
+
{
|
131
|
+
"name": "stdout",
|
132
|
+
"output_type": "stream",
|
133
|
+
"text": [
|
134
|
+
"Lower bound (1 std. dev) as % of true value: 99.0935\n"
|
135
|
+
]
|
136
|
+
}
|
137
|
+
],
|
138
|
+
"source": [
|
139
|
+
"print(\"Lower bound (1 std. dev) as % of true value: \", round(100*sk.get_lower_bound(1) / n, 4))"
|
140
|
+
]
|
141
|
+
},
|
142
|
+
{
|
143
|
+
"cell_type": "markdown",
|
144
|
+
"metadata": {},
|
145
|
+
"source": [
|
146
|
+
"Finally, we can serialize and deserialize the sketch, which will give us back the same structure."
|
147
|
+
]
|
148
|
+
},
|
149
|
+
{
|
150
|
+
"cell_type": "code",
|
151
|
+
"execution_count": 7,
|
152
|
+
"metadata": {},
|
153
|
+
"outputs": [
|
154
|
+
{
|
155
|
+
"data": {
|
156
|
+
"text/plain": [
|
157
|
+
"2484"
|
158
|
+
]
|
159
|
+
},
|
160
|
+
"execution_count": 7,
|
161
|
+
"metadata": {},
|
162
|
+
"output_type": "execute_result"
|
163
|
+
}
|
164
|
+
],
|
165
|
+
"source": [
|
166
|
+
"sk_bytes = sk.serialize()\n",
|
167
|
+
"len(sk_bytes)"
|
168
|
+
]
|
169
|
+
},
|
170
|
+
{
|
171
|
+
"cell_type": "code",
|
172
|
+
"execution_count": 8,
|
173
|
+
"metadata": {},
|
174
|
+
"outputs": [
|
175
|
+
{
|
176
|
+
"name": "stdout",
|
177
|
+
"output_type": "stream",
|
178
|
+
"text": [
|
179
|
+
"### CPC sketch summary:\n",
|
180
|
+
" lgK : 12\n",
|
181
|
+
" seed hash : 93cc\n",
|
182
|
+
" C : 38212\n",
|
183
|
+
" flavor : 4\n",
|
184
|
+
" merged : false\n",
|
185
|
+
" compressed : false\n",
|
186
|
+
" intresting col : 5\n",
|
187
|
+
" HIP estimate : 2.09721e+06\n",
|
188
|
+
" kxp : 11.4725\n",
|
189
|
+
" offset : 6\n",
|
190
|
+
" table : allocated\n",
|
191
|
+
" num SV : 135\n",
|
192
|
+
" window : allocated\n",
|
193
|
+
"### End sketch summary\n",
|
194
|
+
"\n"
|
195
|
+
]
|
196
|
+
}
|
197
|
+
],
|
198
|
+
"source": [
|
199
|
+
"sk2 = cpc_sketch.deserialize(sk_bytes)\n",
|
200
|
+
"print(sk2)"
|
201
|
+
]
|
202
|
+
},
|
203
|
+
{
|
204
|
+
"cell_type": "markdown",
|
205
|
+
"metadata": {},
|
206
|
+
"source": [
|
207
|
+
"### Sketch Union Usage"
|
208
|
+
]
|
209
|
+
},
|
210
|
+
{
|
211
|
+
"cell_type": "markdown",
|
212
|
+
"metadata": {},
|
213
|
+
"source": [
|
214
|
+
"Here, we'll create two sketches with partial overlap in values. For good measure, we'll let k be larger in one sketch. For most applications we'd generally create all new data using the same size sketch, allowing differences to creep in when combining new and historical data."
|
215
|
+
]
|
216
|
+
},
|
217
|
+
{
|
218
|
+
"cell_type": "code",
|
219
|
+
"execution_count": 9,
|
220
|
+
"metadata": {},
|
221
|
+
"outputs": [],
|
222
|
+
"source": [
|
223
|
+
"k = 12\n",
|
224
|
+
"n = 1 << 20\n",
|
225
|
+
"offset = int(3 * n / 4)"
|
226
|
+
]
|
227
|
+
},
|
228
|
+
{
|
229
|
+
"cell_type": "code",
|
230
|
+
"execution_count": 10,
|
231
|
+
"metadata": {},
|
232
|
+
"outputs": [],
|
233
|
+
"source": [
|
234
|
+
"sk1 = cpc_sketch(k)\n",
|
235
|
+
"sk2 = cpc_sketch(k + 1)\n",
|
236
|
+
"for i in range(0, n):\n",
|
237
|
+
" sk1.update(i)\n",
|
238
|
+
" sk2.update(i + offset)"
|
239
|
+
]
|
240
|
+
},
|
241
|
+
{
|
242
|
+
"cell_type": "markdown",
|
243
|
+
"metadata": {},
|
244
|
+
"source": [
|
245
|
+
"Create a union object and add the sketches to that. To demonstrate smoothly handling multiple sketch sizes, we'll use a size of k+1 here."
|
246
|
+
]
|
247
|
+
},
|
248
|
+
{
|
249
|
+
"cell_type": "code",
|
250
|
+
"execution_count": 11,
|
251
|
+
"metadata": {},
|
252
|
+
"outputs": [],
|
253
|
+
"source": [
|
254
|
+
"union = cpc_union(k+1)\n",
|
255
|
+
"union.update(sk1)\n",
|
256
|
+
"union.update(sk2)"
|
257
|
+
]
|
258
|
+
},
|
259
|
+
{
|
260
|
+
"cell_type": "markdown",
|
261
|
+
"metadata": {},
|
262
|
+
"source": [
|
263
|
+
"Note how log config k has automatically adopted the value of the smaller input sketch."
|
264
|
+
]
|
265
|
+
},
|
266
|
+
{
|
267
|
+
"cell_type": "code",
|
268
|
+
"execution_count": 12,
|
269
|
+
"metadata": {},
|
270
|
+
"outputs": [
|
271
|
+
{
|
272
|
+
"name": "stdout",
|
273
|
+
"output_type": "stream",
|
274
|
+
"text": [
|
275
|
+
"### CPC sketch summary:\n",
|
276
|
+
" lgK : 12\n",
|
277
|
+
" seed hash : 93cc\n",
|
278
|
+
" C : 37418\n",
|
279
|
+
" flavor : 4\n",
|
280
|
+
" merged : true\n",
|
281
|
+
" compressed : false\n",
|
282
|
+
" intresting col : 5\n",
|
283
|
+
" HIP estimate : 0\n",
|
284
|
+
" kxp : 4096\n",
|
285
|
+
" offset : 6\n",
|
286
|
+
" table : allocated\n",
|
287
|
+
" num SV : 123\n",
|
288
|
+
" window : allocated\n",
|
289
|
+
"### End sketch summary\n",
|
290
|
+
"\n"
|
291
|
+
]
|
292
|
+
}
|
293
|
+
],
|
294
|
+
"source": [
|
295
|
+
"result = union.get_result()\n",
|
296
|
+
"print(result)"
|
297
|
+
]
|
298
|
+
},
|
299
|
+
{
|
300
|
+
"cell_type": "markdown",
|
301
|
+
"metadata": {},
|
302
|
+
"source": [
|
303
|
+
"We can again compare against the exact result, in this case 1.75*n"
|
304
|
+
]
|
305
|
+
},
|
306
|
+
{
|
307
|
+
"cell_type": "code",
|
308
|
+
"execution_count": 13,
|
309
|
+
"metadata": {},
|
310
|
+
"outputs": [
|
311
|
+
{
|
312
|
+
"name": "stdout",
|
313
|
+
"output_type": "stream",
|
314
|
+
"text": [
|
315
|
+
"Estimate as % of true value: 99.6646\n"
|
316
|
+
]
|
317
|
+
}
|
318
|
+
],
|
319
|
+
"source": [
|
320
|
+
"print(\"Estimate as % of true value: \", round(100*result.get_estimate() / (7*n/4), 4))"
|
321
|
+
]
|
322
|
+
}
|
323
|
+
],
|
324
|
+
"metadata": {
|
325
|
+
"kernelspec": {
|
326
|
+
"display_name": "Python 3",
|
327
|
+
"language": "python",
|
328
|
+
"name": "python3"
|
329
|
+
},
|
330
|
+
"language_info": {
|
331
|
+
"codemirror_mode": {
|
332
|
+
"name": "ipython",
|
333
|
+
"version": 3
|
334
|
+
},
|
335
|
+
"file_extension": ".py",
|
336
|
+
"mimetype": "text/x-python",
|
337
|
+
"name": "python",
|
338
|
+
"nbconvert_exporter": "python",
|
339
|
+
"pygments_lexer": "ipython3",
|
340
|
+
"version": "3.7.0"
|
341
|
+
}
|
342
|
+
},
|
343
|
+
"nbformat": 4,
|
344
|
+
"nbformat_minor": 2
|
345
|
+
}
|