datasketches 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <catch.hpp>
|
|
21
|
+
|
|
22
|
+
#include <kll_sketch.hpp>
|
|
23
|
+
#include <kll_helper.hpp>
|
|
24
|
+
|
|
25
|
+
#include <assert.h>
|
|
26
|
+
|
|
27
|
+
#ifdef KLL_VALIDATION
|
|
28
|
+
|
|
29
|
+
// This is to make sure the implementation matches exactly the reference implementation in OCaml.
|
|
30
|
+
// Conditional compilation is used because the implementation needs a few modifications:
|
|
31
|
+
// - switch from random choice to deterministic
|
|
32
|
+
// - a few methods to expose internals of the sketch
|
|
33
|
+
|
|
34
|
+
namespace datasketches {
|
|
35
|
+
|
|
36
|
+
// Global offset used to make kll_sketch deterministic in KLL_VALIDATION builds.
// The validation test resets it to 0 before feeding each sketch.
// NOTE(review): presumably consumed by kll_sketch in place of its random choice — confirm in kll_sketch_impl.hpp.
uint32_t kll_next_offset; // to make kll_sketch deterministic
|
|
37
|
+
|
|
38
|
+
// Number of reference rows in correct_results. Each row holds 7 values, in order:
// row index, k, n (number of updates), stride, expected number of levels,
// expected number of retained items, expected hash of the retained items.
constexpr unsigned num_tests = 114;
|
|
39
|
+
|
|
40
|
+
const int64_t correct_results[num_tests * 7] = {
|
|
41
|
+
0, 200, 180, 3246533, 1, 180, 1098352976109474698,
|
|
42
|
+
1, 200, 198, 8349603, 1, 198, 686681527497651888,
|
|
43
|
+
2, 200, 217, 676491, 2, 117, 495856134049157644,
|
|
44
|
+
3, 200, 238, 3204507, 2, 138, 44453438498725402,
|
|
45
|
+
4, 200, 261, 2459373, 2, 161, 719830627391926938,
|
|
46
|
+
5, 200, 287, 5902143, 2, 187, 389303173170515580,
|
|
47
|
+
6, 200, 315, 5188793, 2, 215, 985218890825795000,
|
|
48
|
+
7, 200, 346, 801923, 2, 246, 589362992166904413,
|
|
49
|
+
8, 200, 380, 2466269, 2, 280, 1081848693781775853,
|
|
50
|
+
9, 200, 418, 5968041, 2, 318, 533825689515788397,
|
|
51
|
+
10, 200, 459, 3230027, 2, 243, 937332670315558786,
|
|
52
|
+
11, 200, 504, 5125875, 2, 288, 1019197831515566845,
|
|
53
|
+
12, 200, 554, 4195571, 3, 230, 797351479150148224,
|
|
54
|
+
13, 200, 609, 2221181, 3, 285, 451246040374318529,
|
|
55
|
+
14, 200, 669, 5865503, 3, 345, 253851269470815909,
|
|
56
|
+
15, 200, 735, 831703, 3, 411, 491974970526372303,
|
|
57
|
+
16, 200, 808, 4830785, 3, 327, 1032107507126916277,
|
|
58
|
+
17, 200, 888, 1356257, 3, 407, 215225420986342944,
|
|
59
|
+
18, 200, 976, 952071, 3, 417, 600280049738270697,
|
|
60
|
+
19, 200, 1073, 6729833, 3, 397, 341758522977365969,
|
|
61
|
+
20, 200, 1180, 6017925, 3, 406, 1080227312339182949,
|
|
62
|
+
21, 200, 1298, 4229891, 3, 401, 1092460534756675086,
|
|
63
|
+
22, 200, 1427, 7264889, 4, 320, 884533400696890024,
|
|
64
|
+
23, 200, 1569, 5836327, 4, 462, 660575800011134382,
|
|
65
|
+
24, 200, 1725, 5950087, 4, 416, 669373957401387528,
|
|
66
|
+
25, 200, 1897, 2692555, 4, 406, 607308667566496888,
|
|
67
|
+
26, 200, 2086, 1512443, 4, 459, 744260340112029032,
|
|
68
|
+
27, 200, 2294, 2681171, 4, 434, 199120609113802485,
|
|
69
|
+
28, 200, 2523, 3726521, 4, 450, 570993497599288304,
|
|
70
|
+
29, 200, 2775, 2695247, 4, 442, 306717093329516310,
|
|
71
|
+
30, 200, 3052, 5751175, 5, 400, 256024589545754217,
|
|
72
|
+
31, 200, 3357, 1148897, 5, 514, 507276662329207479,
|
|
73
|
+
32, 200, 3692, 484127, 5, 457, 1082660223488175122,
|
|
74
|
+
33, 200, 4061, 6414559, 5, 451, 620820308918522117,
|
|
75
|
+
34, 200, 4467, 5587461, 5, 466, 121975084804459305,
|
|
76
|
+
35, 200, 4913, 1615017, 5, 483, 152986529342916376,
|
|
77
|
+
36, 200, 5404, 6508535, 5, 492, 858526451332425960,
|
|
78
|
+
37, 200, 5944, 2991657, 5, 492, 624906434274621995,
|
|
79
|
+
38, 200, 6538, 6736565, 6, 511, 589153542019036049,
|
|
80
|
+
39, 200, 7191, 1579893, 6, 507, 10255312374117907,
|
|
81
|
+
40, 200, 7910, 412509, 6, 538, 570863587164194186,
|
|
82
|
+
41, 200, 8701, 1112089, 6, 477, 553100668286355347,
|
|
83
|
+
42, 200, 9571, 1258813, 6, 526, 344845406406036297,
|
|
84
|
+
43, 200, 10528, 1980049, 6, 508, 411846569527905064,
|
|
85
|
+
44, 200, 11580, 2167127, 6, 520, 966876726203675488,
|
|
86
|
+
45, 200, 12738, 1975435, 7, 561, 724125506920592732,
|
|
87
|
+
46, 200, 14011, 4289627, 7, 560, 753686005174215572,
|
|
88
|
+
47, 200, 15412, 5384001, 7, 494, 551637841878573955,
|
|
89
|
+
48, 200, 16953, 2902685, 7, 560, 94602851752354802,
|
|
90
|
+
49, 200, 18648, 4806445, 7, 562, 597672400688514221,
|
|
91
|
+
50, 200, 20512, 2085, 7, 529, 417280161591969960,
|
|
92
|
+
51, 200, 22563, 6375939, 7, 558, 11300453985206678,
|
|
93
|
+
52, 200, 24819, 7837057, 7, 559, 283668599967437754,
|
|
94
|
+
53, 200, 27300, 6607975, 8, 561, 122183647493325363,
|
|
95
|
+
54, 200, 30030, 1519191, 8, 550, 1145227891427321202,
|
|
96
|
+
55, 200, 33033, 808061, 8, 568, 71070843834364939,
|
|
97
|
+
56, 200, 36336, 2653529, 8, 570, 450311772805359006,
|
|
98
|
+
57, 200, 39969, 2188957, 8, 561, 269670427054904115,
|
|
99
|
+
58, 200, 43965, 5885655, 8, 539, 1039064186324091890,
|
|
100
|
+
59, 200, 48361, 6185889, 8, 574, 178055275082387938,
|
|
101
|
+
60, 200, 53197, 208767, 9, 579, 139766040442973048,
|
|
102
|
+
61, 200, 58516, 2551345, 9, 569, 322655279254252950,
|
|
103
|
+
62, 200, 64367, 1950873, 9, 569, 101542216315768285,
|
|
104
|
+
63, 200, 70803, 2950429, 9, 582, 72294008568551853,
|
|
105
|
+
64, 200, 77883, 3993977, 9, 572, 299014330559512530,
|
|
106
|
+
65, 200, 85671, 428871, 9, 585, 491351721800568188,
|
|
107
|
+
66, 200, 94238, 6740849, 9, 577, 656204268858348899,
|
|
108
|
+
67, 200, 103661, 2315497, 9, 562, 829926273188300764,
|
|
109
|
+
68, 200, 114027, 5212835, 10, 581, 542222554617639557,
|
|
110
|
+
69, 200, 125429, 4213475, 10, 593, 713339189579860773,
|
|
111
|
+
70, 200, 137971, 2411583, 10, 592, 649651658985845357,
|
|
112
|
+
71, 200, 151768, 5243307, 10, 567, 1017459402785275179,
|
|
113
|
+
72, 200, 166944, 2468367, 10, 593, 115034451827634398,
|
|
114
|
+
73, 200, 183638, 2210923, 10, 583, 365735165000548572,
|
|
115
|
+
74, 200, 202001, 321257, 10, 591, 928479940794929153,
|
|
116
|
+
75, 200, 222201, 8185105, 11, 600, 780163958693677795,
|
|
117
|
+
76, 200, 244421, 6205349, 11, 598, 132454307780236135,
|
|
118
|
+
77, 200, 268863, 3165901, 11, 600, 369824066179493948,
|
|
119
|
+
78, 200, 295749, 2831723, 11, 595, 80968411797441666,
|
|
120
|
+
79, 200, 325323, 464193, 11, 594, 125773061716381917,
|
|
121
|
+
80, 200, 357855, 7499035, 11, 576, 994150328579932916,
|
|
122
|
+
81, 200, 393640, 1514479, 11, 596, 111092193875842594,
|
|
123
|
+
82, 200, 433004, 668493, 12, 607, 497338041653302784,
|
|
124
|
+
83, 200, 476304, 3174931, 12, 606, 845986926165673887,
|
|
125
|
+
84, 200, 523934, 914611, 12, 605, 354993119685278556,
|
|
126
|
+
85, 200, 576327, 7270385, 12, 602, 937679531753465428,
|
|
127
|
+
86, 200, 633959, 1956979, 12, 598, 659413123921208266,
|
|
128
|
+
87, 200, 697354, 3137635, 12, 606, 874228711599628459,
|
|
129
|
+
88, 200, 767089, 214923, 12, 608, 1077644643342432307,
|
|
130
|
+
89, 200, 843797, 3084545, 13, 612, 79317113064339979,
|
|
131
|
+
90, 200, 928176, 7800899, 13, 612, 357414065779796772,
|
|
132
|
+
91, 200, 1020993, 6717253, 13, 615, 532723577905833296,
|
|
133
|
+
92, 200, 1123092, 5543015, 13, 614, 508695073250223746,
|
|
134
|
+
93, 200, 1235401, 298785, 13, 616, 34344606952783179,
|
|
135
|
+
94, 200, 1358941, 4530313, 13, 607, 169924026179364121,
|
|
136
|
+
95, 200, 1494835, 4406457, 13, 612, 1026773494313671061,
|
|
137
|
+
96, 200, 1644318, 1540983, 13, 614, 423454640036650614,
|
|
138
|
+
97, 200, 1808749, 7999631, 14, 624, 466122870338520329,
|
|
139
|
+
98, 200, 1989623, 4295537, 14, 621, 609309853701283445,
|
|
140
|
+
99, 200, 2188585, 7379971, 14, 622, 141739898871015642,
|
|
141
|
+
100, 200, 2407443, 6188931, 14, 621, 22515080776738923,
|
|
142
|
+
101, 200, 2648187, 6701239, 14, 619, 257441864177795548,
|
|
143
|
+
102, 200, 2913005, 2238709, 14, 623, 867028825821064773,
|
|
144
|
+
103, 200, 3204305, 5371075, 14, 625, 1110615471273395112,
|
|
145
|
+
104, 200, 3524735, 7017341, 15, 631, 619518037415974467,
|
|
146
|
+
105, 200, 3877208, 323337, 15, 633, 513230912593541122,
|
|
147
|
+
106, 200, 4264928, 6172471, 15, 628, 885861662583325072,
|
|
148
|
+
107, 200, 4691420, 5653803, 15, 633, 754052473303005204,
|
|
149
|
+
108, 200, 5160562, 1385265, 15, 630, 294993765757975100,
|
|
150
|
+
109, 200, 5676618, 4350899, 15, 617, 1073144684944932303,
|
|
151
|
+
110, 200, 6244279, 1272235, 15, 630, 308982934296855020,
|
|
152
|
+
111, 200, 6868706, 1763939, 16, 638, 356231694823272867,
|
|
153
|
+
112, 200, 7555576, 3703411, 16, 636, 20043268926300101,
|
|
154
|
+
113, 200, 8311133, 6554171, 16, 637, 121111429906734123
|
|
155
|
+
};
|
|
156
|
+
|
|
157
|
+
// Generates the deterministic input sequence for one validation run:
// element i is the running sum of (i + 1) strides, reduced to its low 23 bits.
// The stride is asserted to be odd. Returns a newly allocated array of n ints.
static std::unique_ptr<int[]> make_input_array(unsigned n, unsigned stride) {
  assert (kll_helper::is_odd(stride));
  // because library items are single-precision floats at the moment
  const unsigned low_23_bits = (1 << 23) - 1;
  std::unique_ptr<int[]> values(new int[n]);
  unsigned running = 0;
  for (unsigned idx = 0; idx != n; ++idx) {
    running = (running + stride) & low_23_bits;
    values[idx] = running;
  }
  return values;
}
|
|
169
|
+
|
|
170
|
+
// Folds arr[start .. start + length) into a 60-bit hash.
// For every element: add its value (truncated to an integer), multiply by a fixed
// odd constant, keep the low 60 bits, then xor in the value shifted right by 30.
// Returns 0 for an empty range; the result always fits in 60 bits.
//
// The arithmetic is done on uint64_t on purpose: the product regularly exceeds
// 63 bits, and overflowing a signed 64-bit integer is undefined behaviour.
// Unsigned arithmetic wraps modulo 2^64, which leaves the masked low 60 bits
// exactly as the reference values expect.
static int64_t simple_hash_of_sub_array(const float* arr, unsigned start, unsigned length) {
  const uint64_t multiplier = 738219921; // an arbitrary odd 30-bit number
  const uint64_t mask60 = (1ULL << 60) - 1ULL;
  uint64_t accum = 0;
  for (unsigned i = start; i < start + length; i++) {
    // float -> signed integer first, so the truncation matches the original cast
    accum += static_cast<uint64_t>(static_cast<int64_t>(arr[i]));
    accum *= multiplier;
    accum &= mask60;
    accum ^= accum >> 30;
  }
  // accum < 2^60 here, so the conversion back to int64_t is value-preserving
  return static_cast<int64_t>(accum);
}
|
|
182
|
+
|
|
183
|
+
// Replays every reference row of correct_results: builds the input sequence from
// the row's n and stride, feeds it to a kll_sketch<float> created with the row's k
// (after resetting kll_next_offset), and compares the number of levels, the number
// of retained items and the hash of the retained items against the row.
TEST_CASE("kll validation", "[kll_sketch][validation]") {
  for (unsigned i = 0; i < num_tests; i++) {
    // the first column of each row is its own index
    assert (correct_results[7 * i] == i);
    const unsigned k = correct_results[7 * i + 1];
    const unsigned n = correct_results[7 * i + 2];
    const unsigned stride = correct_results[7 * i + 3];
    const auto input_array = make_input_array(n, stride);

    kll_sketch<float> sketch(k);
    kll_next_offset = 0; // restart the deterministic sequence for this sketch
    for (unsigned pos = 0; pos != n; ++pos) {
      sketch.update(input_array[pos]);
    }

    unsigned num_levels = sketch.get_num_levels();
    unsigned num_samples = sketch.get_num_retained();
    int64_t hashed_samples = simple_hash_of_sub_array(sketch.get_items(), sketch.get_levels()[0], num_samples);

    std::cout << i;
    REQUIRE(correct_results[7 * i + 4] == num_levels);
    REQUIRE(correct_results[7 * i + 5] == num_samples);
    if (correct_results[7 * i + 6] != hashed_samples) {
      // report expected vs. actual hash and dump the sketch before failing
      std::cout << " " << (correct_results[7 * i + 6]) << " != " << hashed_samples;
      sketch.to_stream(std::cout);
      FAIL();
    } else {
      std::cout << " pass" << std::endl;
    }
  }
}
|
|
210
|
+
|
|
211
|
+
// Sanity check of the hash helper itself: hashes five elements of a fixed array,
// starting at index 1, and compares against a known reference value.
// The tag is spelled [validation] so a tag filter selects this case together
// with the main validation test.
TEST_CASE("kll validation: test hash", "[kll_sketch][validation]") {
  float array[] = { 907500, 944104, 807020, 219921, 678370, 955217, 426885 };
  REQUIRE(simple_hash_of_sub_array(array, 1, 5) == 1141543353991880193LL);
}
|
|
215
|
+
|
|
216
|
+
// Sanity check of the input generator: the first six values produced for
// stride 3654721 must match the known reference sequence.
// The tag is spelled [validation] so a tag filter selects this case together
// with the main validation test.
TEST_CASE("kll validation: make input array", "[kll_sketch][validation]") {
  const unsigned count = 6;
  const int expected_array[count] = { 3654721, 7309442, 2575555, 6230276, 1496389, 5151110 };
  auto array(make_input_array(count, 3654721));
  for (unsigned i = 0; i < count; i++) {
    REQUIRE(array[i] == expected_array[i]);
  }
}
|
|
226
|
+
|
|
227
|
+
} /* namespace datasketches */
|
|
228
|
+
|
|
229
|
+
#endif
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["wheel",
|
|
3
|
+
"setuptools >= 30.3.0",
|
|
4
|
+
"setuptools_scm",
|
|
5
|
+
"cmake >= 3.12"]
|
|
6
|
+
|
|
7
|
+
[tool.tox]
|
|
8
|
+
legacy_tox_ini = """
|
|
9
|
+
[tox]
|
|
10
|
+
envlist = py3
|
|
11
|
+
|
|
12
|
+
[testenv]
|
|
13
|
+
deps = pytest
|
|
14
|
+
numpy
|
|
15
|
+
changedir = python/tests
|
|
16
|
+
commands = pytest
|
|
17
|
+
"""
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
18
|
+
# TODO: Can we force python version >= 3.0?
|
|
19
|
+
if (MSVC) # pick the language-standard flag pybind11 should compile the module with
|
|
20
|
+
set(PYBIND11_CPP_STANDARD /std:c++14) # MSVC's /std: option has no c++11 value; c++14 is the lowest it accepts
|
|
21
|
+
else()
|
|
22
|
+
set(PYBIND11_CPP_STANDARD -std=c++11) # GCC/Clang-style flag for all other compilers
|
|
23
|
+
endif()
|
|
24
|
+
|
|
25
|
+
add_subdirectory(pybind11) # process the pybind11 directory's own CMake project (must exist locally)
|
|
26
|
+
|
|
27
|
+
pybind11_add_module(python MODULE EXCLUDE_FROM_ALL SYSTEM THIN_LTO) # declares the "python" extension target with no sources here; sources are attached later via target_sources
|
|
28
|
+
|
|
29
|
+
target_link_libraries(python # link the extension target against the listed targets plus pybind11
|
|
30
|
+
PRIVATE
|
|
31
|
+
common
|
|
32
|
+
hll
|
|
33
|
+
kll
|
|
34
|
+
cpc
|
|
35
|
+
fi
|
|
36
|
+
theta
|
|
37
|
+
sampling
|
|
38
|
+
pybind11::module
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
set_target_properties(python PROPERTIES # control the file name of the built module
|
|
42
|
+
PREFIX "" # empty prefix: no "lib" (or other) prefix on the output file
|
|
43
|
+
OUTPUT_NAME datasketches # base name of the output file, matching `import datasketches`
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
# When CMAKE_SYSTEM_NAME matches "Darwin" (macOS), force the ".so" suffix so the module is not emitted as a .dylib
|
|
47
|
+
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
|
|
48
|
+
set_target_properties(python PROPERTIES SUFFIX ".so")
|
|
49
|
+
endif()
|
|
50
|
+
|
|
51
|
+
target_sources(python # attach the binding sources to the "python" extension target
|
|
52
|
+
PRIVATE
|
|
53
|
+
src/datasketches.cpp
|
|
54
|
+
src/hll_wrapper.cpp
|
|
55
|
+
src/kll_wrapper.cpp
|
|
56
|
+
src/cpc_wrapper.cpp
|
|
57
|
+
src/fi_wrapper.cpp
|
|
58
|
+
src/theta_wrapper.cpp
|
|
59
|
+
src/vo_wrapper.cpp
|
|
60
|
+
src/vector_of_kll.cpp
|
|
61
|
+
)
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# Python Wrapper for Datasketches
|
|
2
|
+
|
|
3
|
+
## Installation
|
|
4
|
+
|
|
5
|
+
The release files do not include the needed python binding library ([pybind11](https://github.com/pybind/pybind11)). If building
|
|
6
|
+
from a release package, you must ensure that the pybind11 directory points to a local copy of pybind11.
|
|
7
|
+
|
|
8
|
+
An official pypi build is eventually planned but not yet available.
|
|
9
|
+
|
|
10
|
+
If you instead want to take a (possibly ill-advised) gamble on the current state of the master branch being useable, you can run:
|
|
11
|
+
```pip install git+https://github.com/apache/datasketches-cpp.git```
|
|
12
|
+
|
|
13
|
+
## Developer Instructions
|
|
14
|
+
|
|
15
|
+
### Building
|
|
16
|
+
|
|
17
|
+
When cloning the source repository, you should include the pybind11 submodule with the `--recursive` option to the clone command:
|
|
18
|
+
```
|
|
19
|
+
git clone --recursive https://github.com/apache/datasketches-cpp.git
|
|
20
|
+
cd datasketches-cpp
|
|
21
|
+
python -m pip install --upgrade pip setuptools wheel numpy
|
|
22
|
+
python setup.py build
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
If you cloned without `--recursive`, you can add the submodule post-checkout using `git submodule update --init --recursive`.
|
|
26
|
+
|
|
27
|
+
### Installing
|
|
28
|
+
|
|
29
|
+
Assuming you have already checked out the library and any dependent submodules, install by simply replacing the last
|
|
30
|
+
line of the build command with `python setup.py install`.
|
|
31
|
+
|
|
32
|
+
### Unit tests
|
|
33
|
+
|
|
34
|
+
The python tests are run with `tox`. To ensure you have all the needed packages, from the package base directory run:
|
|
35
|
+
```
|
|
36
|
+
python -m pip install --upgrade pip setuptools wheel numpy tox
|
|
37
|
+
tox
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Usage
|
|
41
|
+
|
|
42
|
+
Having installed the library, loading the Datasketches library in Python is simple: `import datasketches`.
|
|
43
|
+
|
|
44
|
+
## Available Sketch Classes
|
|
45
|
+
|
|
46
|
+
- KLL
|
|
47
|
+
- `kll_ints_sketch`
|
|
48
|
+
- `kll_floats_sketch`
|
|
49
|
+
- Frequent Items
|
|
50
|
+
- `frequent_strings_sketch`
|
|
51
|
+
- Error types are `frequent_items_error_type.{NO_FALSE_NEGATIVES | NO_FALSE_POSITIVES}`
|
|
52
|
+
- Theta
|
|
53
|
+
- `update_theta_sketch`
|
|
54
|
+
- `compact_theta_sketch` (cannot be instantiated directly)
|
|
55
|
+
- `theta_union`
|
|
56
|
+
- `theta_intersection`
|
|
57
|
+
- `theta_a_not_b`
|
|
58
|
+
- HLL
|
|
59
|
+
- `hll_sketch`
|
|
60
|
+
- `hll_union`
|
|
61
|
+
- Target HLL types are `tgt_hll_type.{HLL_4 | HLL_6 | HLL_8}`
|
|
62
|
+
- CPC
|
|
63
|
+
- `cpc_sketch`
|
|
64
|
+
- `cpc_union`
|
|
65
|
+
- VarOpt Sampling
|
|
66
|
+
- `var_opt_sketch`
|
|
67
|
+
- `var_opt_union`
|
|
68
|
+
- Vector of KLL
|
|
69
|
+
- `vector_of_kll_ints_sketches`
|
|
70
|
+
- `vector_of_kll_floats_sketches`
|
|
71
|
+
|
|
72
|
+
## Known Differences from C++
|
|
73
|
+
|
|
74
|
+
The Python API largely mirrors the C++ API, with a few minor exceptions. The primary known difference is that Python on modern platforms does not support unsigned integer values or numeric values with fewer than 64 bits. As a result, you may not be able to produce identical sketches from within Python as you can with Java and C++. Loading those sketches after they have been serialized from another language will work as expected.
|
|
75
|
+
|
|
76
|
+
The Vector of KLL object is currently exclusive to python, and holds an array of independent KLL sketches. This is useful for creating a set of KLL sketches over a vector and has been designed to allow input as either a vector or a matrix of multiple vectors.
|
|
77
|
+
|
|
78
|
+
We have also removed reliance on a builder class for theta sketches as Python allows named arguments to the constructor, not strictly positional arguments.
|
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "markdown",
|
|
5
|
+
"metadata": {},
|
|
6
|
+
"source": [
|
|
7
|
+
"## CPC Sketch Examples"
|
|
8
|
+
]
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"cell_type": "markdown",
|
|
12
|
+
"metadata": {},
|
|
13
|
+
"source": [
|
|
14
|
+
"### Basic Sketch Usage"
|
|
15
|
+
]
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"cell_type": "code",
|
|
19
|
+
"execution_count": 1,
|
|
20
|
+
"metadata": {},
|
|
21
|
+
"outputs": [],
|
|
22
|
+
"source": [
|
|
23
|
+
"from datasketches import cpc_sketch, cpc_union"
|
|
24
|
+
]
|
|
25
|
+
},
|
|
26
|
+
{
|
|
27
|
+
"cell_type": "markdown",
|
|
28
|
+
"metadata": {},
|
|
29
|
+
"source": [
|
|
30
|
+
"We'll create a sketch with log2(k) = 12"
|
|
31
|
+
]
|
|
32
|
+
},
|
|
33
|
+
{
|
|
34
|
+
"cell_type": "code",
|
|
35
|
+
"execution_count": 2,
|
|
36
|
+
"metadata": {},
|
|
37
|
+
"outputs": [],
|
|
38
|
+
"source": [
|
|
39
|
+
"sk = cpc_sketch(12)"
|
|
40
|
+
]
|
|
41
|
+
},
|
|
42
|
+
{
|
|
43
|
+
"cell_type": "markdown",
|
|
44
|
+
"metadata": {},
|
|
45
|
+
"source": [
|
|
46
|
+
"Insert ~2 million points. Values are hashed, so using sequential integers is fine for demonstration purposes."
|
|
47
|
+
]
|
|
48
|
+
},
|
|
49
|
+
{
|
|
50
|
+
"cell_type": "code",
|
|
51
|
+
"execution_count": 3,
|
|
52
|
+
"metadata": {},
|
|
53
|
+
"outputs": [
|
|
54
|
+
{
|
|
55
|
+
"name": "stdout",
|
|
56
|
+
"output_type": "stream",
|
|
57
|
+
"text": [
|
|
58
|
+
"### CPC sketch summary:\n",
|
|
59
|
+
" lgK : 12\n",
|
|
60
|
+
" seed hash : 93cc\n",
|
|
61
|
+
" C : 38212\n",
|
|
62
|
+
" flavor : 4\n",
|
|
63
|
+
" merged : false\n",
|
|
64
|
+
" compressed : false\n",
|
|
65
|
+
" intresting col : 5\n",
|
|
66
|
+
" HIP estimate : 2.09721e+06\n",
|
|
67
|
+
" kxp : 11.4725\n",
|
|
68
|
+
" offset : 6\n",
|
|
69
|
+
" table : allocated\n",
|
|
70
|
+
" num SV : 135\n",
|
|
71
|
+
" window : allocated\n",
|
|
72
|
+
"### End sketch summary\n",
|
|
73
|
+
"\n"
|
|
74
|
+
]
|
|
75
|
+
}
|
|
76
|
+
],
|
|
77
|
+
"source": [
|
|
78
|
+
"n = 1 << 21\n",
|
|
79
|
+
"for i in range(0, n):\n",
|
|
80
|
+
" sk.update(i)\n",
|
|
81
|
+
"print(sk)"
|
|
82
|
+
]
|
|
83
|
+
},
|
|
84
|
+
{
|
|
85
|
+
"cell_type": "markdown",
|
|
86
|
+
"metadata": {},
|
|
87
|
+
"source": [
|
|
88
|
+
"Since we know the exact value of n we can look at the estimate and upper/lower bounds as a % of the true value. We'll look at the bounds at 1 standard deviation. In this case, the true value does lie within the bounds, but since these are probabilistic bounds the true value will sometimes be outside them (especially at 1 standard deviation)."
|
|
89
|
+
]
|
|
90
|
+
},
|
|
91
|
+
{
|
|
92
|
+
"cell_type": "code",
|
|
93
|
+
"execution_count": 4,
|
|
94
|
+
"metadata": {},
|
|
95
|
+
"outputs": [
|
|
96
|
+
{
|
|
97
|
+
"name": "stdout",
|
|
98
|
+
"output_type": "stream",
|
|
99
|
+
"text": [
|
|
100
|
+
"Upper bound (1 std. dev) as % of true value: 100.9281\n"
|
|
101
|
+
]
|
|
102
|
+
}
|
|
103
|
+
],
|
|
104
|
+
"source": [
|
|
105
|
+
"print(\"Upper bound (1 std. dev) as % of true value: \", round(100*sk.get_upper_bound(1) / n, 4))"
|
|
106
|
+
]
|
|
107
|
+
},
|
|
108
|
+
{
|
|
109
|
+
"cell_type": "code",
|
|
110
|
+
"execution_count": 5,
|
|
111
|
+
"metadata": {},
|
|
112
|
+
"outputs": [
|
|
113
|
+
{
|
|
114
|
+
"name": "stdout",
|
|
115
|
+
"output_type": "stream",
|
|
116
|
+
"text": [
|
|
117
|
+
"Estimate as % of true value: 100.0026\n"
|
|
118
|
+
]
|
|
119
|
+
}
|
|
120
|
+
],
|
|
121
|
+
"source": [
|
|
122
|
+
"print(\"Estimate as % of true value: \", round(100*sk.get_estimate() / n, 4))"
|
|
123
|
+
]
|
|
124
|
+
},
|
|
125
|
+
{
|
|
126
|
+
"cell_type": "code",
|
|
127
|
+
"execution_count": 6,
|
|
128
|
+
"metadata": {},
|
|
129
|
+
"outputs": [
|
|
130
|
+
{
|
|
131
|
+
"name": "stdout",
|
|
132
|
+
"output_type": "stream",
|
|
133
|
+
"text": [
|
|
134
|
+
"Lower bound (1 std. dev) as % of true value: 99.0935\n"
|
|
135
|
+
]
|
|
136
|
+
}
|
|
137
|
+
],
|
|
138
|
+
"source": [
|
|
139
|
+
"print(\"Lower bound (1 std. dev) as % of true value: \", round(100*sk.get_lower_bound(1) / n, 4))"
|
|
140
|
+
]
|
|
141
|
+
},
|
|
142
|
+
{
|
|
143
|
+
"cell_type": "markdown",
|
|
144
|
+
"metadata": {},
|
|
145
|
+
"source": [
|
|
146
|
+
"Finally, we can serialize and deserialize the sketch, which will give us back the same structure."
|
|
147
|
+
]
|
|
148
|
+
},
|
|
149
|
+
{
|
|
150
|
+
"cell_type": "code",
|
|
151
|
+
"execution_count": 7,
|
|
152
|
+
"metadata": {},
|
|
153
|
+
"outputs": [
|
|
154
|
+
{
|
|
155
|
+
"data": {
|
|
156
|
+
"text/plain": [
|
|
157
|
+
"2484"
|
|
158
|
+
]
|
|
159
|
+
},
|
|
160
|
+
"execution_count": 7,
|
|
161
|
+
"metadata": {},
|
|
162
|
+
"output_type": "execute_result"
|
|
163
|
+
}
|
|
164
|
+
],
|
|
165
|
+
"source": [
|
|
166
|
+
"sk_bytes = sk.serialize()\n",
|
|
167
|
+
"len(sk_bytes)"
|
|
168
|
+
]
|
|
169
|
+
},
|
|
170
|
+
{
|
|
171
|
+
"cell_type": "code",
|
|
172
|
+
"execution_count": 8,
|
|
173
|
+
"metadata": {},
|
|
174
|
+
"outputs": [
|
|
175
|
+
{
|
|
176
|
+
"name": "stdout",
|
|
177
|
+
"output_type": "stream",
|
|
178
|
+
"text": [
|
|
179
|
+
"### CPC sketch summary:\n",
|
|
180
|
+
" lgK : 12\n",
|
|
181
|
+
" seed hash : 93cc\n",
|
|
182
|
+
" C : 38212\n",
|
|
183
|
+
" flavor : 4\n",
|
|
184
|
+
" merged : false\n",
|
|
185
|
+
" compressed : false\n",
|
|
186
|
+
" intresting col : 5\n",
|
|
187
|
+
" HIP estimate : 2.09721e+06\n",
|
|
188
|
+
" kxp : 11.4725\n",
|
|
189
|
+
" offset : 6\n",
|
|
190
|
+
" table : allocated\n",
|
|
191
|
+
" num SV : 135\n",
|
|
192
|
+
" window : allocated\n",
|
|
193
|
+
"### End sketch summary\n",
|
|
194
|
+
"\n"
|
|
195
|
+
]
|
|
196
|
+
}
|
|
197
|
+
],
|
|
198
|
+
"source": [
|
|
199
|
+
"sk2 = cpc_sketch.deserialize(sk_bytes)\n",
|
|
200
|
+
"print(sk2)"
|
|
201
|
+
]
|
|
202
|
+
},
|
|
203
|
+
{
|
|
204
|
+
"cell_type": "markdown",
|
|
205
|
+
"metadata": {},
|
|
206
|
+
"source": [
|
|
207
|
+
"### Sketch Union Usage"
|
|
208
|
+
]
|
|
209
|
+
},
|
|
210
|
+
{
|
|
211
|
+
"cell_type": "markdown",
|
|
212
|
+
"metadata": {},
|
|
213
|
+
"source": [
|
|
214
|
+
"Here, we'll create two sketches with partial overlap in values. For good measure, we'll let k be larger in one sketch. For most applications we'd generally create all new data using the same size sketch, allowing differences to creep in when combining new and historical data."
|
|
215
|
+
]
|
|
216
|
+
},
|
|
217
|
+
{
|
|
218
|
+
"cell_type": "code",
|
|
219
|
+
"execution_count": 9,
|
|
220
|
+
"metadata": {},
|
|
221
|
+
"outputs": [],
|
|
222
|
+
"source": [
|
|
223
|
+
"k = 12\n",
|
|
224
|
+
"n = 1 << 20\n",
|
|
225
|
+
"offset = int(3 * n / 4)"
|
|
226
|
+
]
|
|
227
|
+
},
|
|
228
|
+
{
|
|
229
|
+
"cell_type": "code",
|
|
230
|
+
"execution_count": 10,
|
|
231
|
+
"metadata": {},
|
|
232
|
+
"outputs": [],
|
|
233
|
+
"source": [
|
|
234
|
+
"sk1 = cpc_sketch(k)\n",
|
|
235
|
+
"sk2 = cpc_sketch(k + 1)\n",
|
|
236
|
+
"for i in range(0, n):\n",
|
|
237
|
+
" sk1.update(i)\n",
|
|
238
|
+
" sk2.update(i + offset)"
|
|
239
|
+
]
|
|
240
|
+
},
|
|
241
|
+
{
|
|
242
|
+
"cell_type": "markdown",
|
|
243
|
+
"metadata": {},
|
|
244
|
+
"source": [
|
|
245
|
+
"Create a union object and add the sketches to that. To demonstrate smoothly handling multiple sketch sizes, we'll use a size of k+1 here."
|
|
246
|
+
]
|
|
247
|
+
},
|
|
248
|
+
{
|
|
249
|
+
"cell_type": "code",
|
|
250
|
+
"execution_count": 11,
|
|
251
|
+
"metadata": {},
|
|
252
|
+
"outputs": [],
|
|
253
|
+
"source": [
|
|
254
|
+
"union = cpc_union(k+1)\n",
|
|
255
|
+
"union.update(sk1)\n",
|
|
256
|
+
"union.update(sk2)"
|
|
257
|
+
]
|
|
258
|
+
},
|
|
259
|
+
{
|
|
260
|
+
"cell_type": "markdown",
|
|
261
|
+
"metadata": {},
|
|
262
|
+
"source": [
|
|
263
|
+
"Note how log config k has automatically adopted the value of the smaller input sketch."
|
|
264
|
+
]
|
|
265
|
+
},
|
|
266
|
+
{
|
|
267
|
+
"cell_type": "code",
|
|
268
|
+
"execution_count": 12,
|
|
269
|
+
"metadata": {},
|
|
270
|
+
"outputs": [
|
|
271
|
+
{
|
|
272
|
+
"name": "stdout",
|
|
273
|
+
"output_type": "stream",
|
|
274
|
+
"text": [
|
|
275
|
+
"### CPC sketch summary:\n",
|
|
276
|
+
" lgK : 12\n",
|
|
277
|
+
" seed hash : 93cc\n",
|
|
278
|
+
" C : 37418\n",
|
|
279
|
+
" flavor : 4\n",
|
|
280
|
+
" merged : true\n",
|
|
281
|
+
" compressed : false\n",
|
|
282
|
+
" intresting col : 5\n",
|
|
283
|
+
" HIP estimate : 0\n",
|
|
284
|
+
" kxp : 4096\n",
|
|
285
|
+
" offset : 6\n",
|
|
286
|
+
" table : allocated\n",
|
|
287
|
+
" num SV : 123\n",
|
|
288
|
+
" window : allocated\n",
|
|
289
|
+
"### End sketch summary\n",
|
|
290
|
+
"\n"
|
|
291
|
+
]
|
|
292
|
+
}
|
|
293
|
+
],
|
|
294
|
+
"source": [
|
|
295
|
+
"result = union.get_result()\n",
|
|
296
|
+
"print(result)"
|
|
297
|
+
]
|
|
298
|
+
},
|
|
299
|
+
{
|
|
300
|
+
"cell_type": "markdown",
|
|
301
|
+
"metadata": {},
|
|
302
|
+
"source": [
|
|
303
|
+
"We can again compare against the exact result, in this case 1.75*n"
|
|
304
|
+
]
|
|
305
|
+
},
|
|
306
|
+
{
|
|
307
|
+
"cell_type": "code",
|
|
308
|
+
"execution_count": 13,
|
|
309
|
+
"metadata": {},
|
|
310
|
+
"outputs": [
|
|
311
|
+
{
|
|
312
|
+
"name": "stdout",
|
|
313
|
+
"output_type": "stream",
|
|
314
|
+
"text": [
|
|
315
|
+
"Estimate as % of true value: 99.6646\n"
|
|
316
|
+
]
|
|
317
|
+
}
|
|
318
|
+
],
|
|
319
|
+
"source": [
|
|
320
|
+
"print(\"Estimate as % of true value: \", round(100*result.get_estimate() / (7*n/4), 4))"
|
|
321
|
+
]
|
|
322
|
+
}
|
|
323
|
+
],
|
|
324
|
+
"metadata": {
|
|
325
|
+
"kernelspec": {
|
|
326
|
+
"display_name": "Python 3",
|
|
327
|
+
"language": "python",
|
|
328
|
+
"name": "python3"
|
|
329
|
+
},
|
|
330
|
+
"language_info": {
|
|
331
|
+
"codemirror_mode": {
|
|
332
|
+
"name": "ipython",
|
|
333
|
+
"version": 3
|
|
334
|
+
},
|
|
335
|
+
"file_extension": ".py",
|
|
336
|
+
"mimetype": "text/x-python",
|
|
337
|
+
"name": "python",
|
|
338
|
+
"nbconvert_exporter": "python",
|
|
339
|
+
"pygments_lexer": "ipython3",
|
|
340
|
+
"version": "3.7.0"
|
|
341
|
+
}
|
|
342
|
+
},
|
|
343
|
+
"nbformat": 4,
|
|
344
|
+
"nbformat_minor": 2
|
|
345
|
+
}
|