datasketches 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
18
|
+
import unittest
|
|
19
|
+
from datasketches import (kll_ints_sketch, kll_floats_sketch,
|
|
20
|
+
vector_of_kll_ints_sketches,
|
|
21
|
+
vector_of_kll_floats_sketches)
|
|
22
|
+
import numpy as np
|
|
23
|
+
|
|
24
|
+
class KllTest(unittest.TestCase):
|
|
25
|
+
def test_kll_example(self):
|
|
26
|
+
k = 160
|
|
27
|
+
n = 2 ** 20
|
|
28
|
+
|
|
29
|
+
# create a sketch and inject ~1 million N(0,1) points as an array and as a single item
|
|
30
|
+
kll = kll_floats_sketch(k)
|
|
31
|
+
kll.update(np.random.normal(size=n-1))
|
|
32
|
+
kll.update(0.0)
|
|
33
|
+
|
|
34
|
+
# 0 should be near the median
|
|
35
|
+
self.assertAlmostEqual(0.5, kll.get_rank(0.0), delta=0.025)
|
|
36
|
+
|
|
37
|
+
# the median should be near 0
|
|
38
|
+
self.assertAlmostEqual(0.0, kll.get_quantile(0.5), delta=0.025)
|
|
39
|
+
|
|
40
|
+
# we also track the min/max independently from the rest of the data
|
|
41
|
+
# which lets us know the full observed data range
|
|
42
|
+
self.assertLessEqual(kll.get_min_value(), kll.get_quantile(0.01))
|
|
43
|
+
self.assertLessEqual(0.0, kll.get_rank(kll.get_min_value()))
|
|
44
|
+
self.assertGreaterEqual(kll.get_max_value(), kll.get_quantile(0.99))
|
|
45
|
+
self.assertGreaterEqual(1.0, kll.get_rank(kll.get_max_value()))
|
|
46
|
+
|
|
47
|
+
# we can also extract a list of values at a time,
|
|
48
|
+
# here the values should give us something close to [-2, -1, 0, 1, 2].
|
|
49
|
+
# then get the CDF, which will return something close to
|
|
50
|
+
# the original values used in get_quantiles()
|
|
51
|
+
# finally, can check the normalized rank error bound
|
|
52
|
+
pts = kll.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772])
|
|
53
|
+
cdf = kll.get_cdf(pts) # include 1.0 at end to account for all probability mass
|
|
54
|
+
self.assertEqual(len(cdf), len(pts)+1)
|
|
55
|
+
err = kll.normalized_rank_error(False)
|
|
56
|
+
self.assertEqual(err, kll_floats_sketch.get_normalized_rank_error(k, False))
|
|
57
|
+
|
|
58
|
+
# and a few basic queries about the sketch
|
|
59
|
+
self.assertFalse(kll.is_empty())
|
|
60
|
+
self.assertTrue(kll.is_estimation_mode())
|
|
61
|
+
self.assertEqual(kll.get_n(), n)
|
|
62
|
+
self.assertLess(kll.get_num_retained(), n)
|
|
63
|
+
|
|
64
|
+
# merging itself will double the number of items the sketch has seen
|
|
65
|
+
kll.merge(kll)
|
|
66
|
+
self.assertEqual(kll.get_n(), 2*n)
|
|
67
|
+
|
|
68
|
+
# we can then serialize and reconstruct the sketch
|
|
69
|
+
kll_bytes = kll.serialize()
|
|
70
|
+
new_kll = kll.deserialize(kll_bytes)
|
|
71
|
+
self.assertEqual(kll.get_num_retained(), new_kll.get_num_retained())
|
|
72
|
+
self.assertEqual(kll.get_min_value(), new_kll.get_min_value())
|
|
73
|
+
self.assertEqual(kll.get_max_value(), new_kll.get_max_value())
|
|
74
|
+
self.assertEqual(kll.get_quantile(0.7), new_kll.get_quantile(0.7))
|
|
75
|
+
self.assertEqual(kll.get_rank(0.0), new_kll.get_rank(0.0))
|
|
76
|
+
|
|
77
|
+
def test_kll_ints_sketch(self):
|
|
78
|
+
k = 100
|
|
79
|
+
n = 10
|
|
80
|
+
kll = kll_ints_sketch(k)
|
|
81
|
+
for i in range(0, n):
|
|
82
|
+
kll.update(i)
|
|
83
|
+
|
|
84
|
+
self.assertEqual(kll.get_min_value(), 0)
|
|
85
|
+
self.assertEqual(kll.get_max_value(), n-1)
|
|
86
|
+
self.assertEqual(kll.get_n(), n)
|
|
87
|
+
self.assertFalse(kll.is_empty())
|
|
88
|
+
self.assertFalse(kll.is_estimation_mode()) # n < k
|
|
89
|
+
|
|
90
|
+
pmf = kll.get_pmf([round(n/2)])
|
|
91
|
+
self.assertIsNotNone(pmf)
|
|
92
|
+
self.assertEqual(len(pmf), 2)
|
|
93
|
+
|
|
94
|
+
cdf = kll.get_cdf([round(n/2)])
|
|
95
|
+
self.assertIsNotNone(cdf)
|
|
96
|
+
self.assertEqual(len(cdf), 2)
|
|
97
|
+
|
|
98
|
+
self.assertEqual(kll.get_quantile(0.5), round(n/2))
|
|
99
|
+
quants = kll.get_quantiles([0.25, 0.5, 0.75])
|
|
100
|
+
self.assertIsNotNone(quants)
|
|
101
|
+
self.assertEqual(len(quants), 3)
|
|
102
|
+
|
|
103
|
+
self.assertEqual(kll.get_rank(round(n/2)), 0.5)
|
|
104
|
+
|
|
105
|
+
# merge self
|
|
106
|
+
kll.merge(kll)
|
|
107
|
+
self.assertEqual(kll.get_n(), 2 * n)
|
|
108
|
+
|
|
109
|
+
sk_bytes = kll.serialize()
|
|
110
|
+
self.assertTrue(isinstance(kll_ints_sketch.deserialize(sk_bytes), kll_ints_sketch))
|
|
111
|
+
|
|
112
|
+
def test_kll_floats_sketch(self):
|
|
113
|
+
# already tested ints and it's templatized, so just make sure it instantiates properly
|
|
114
|
+
k = 75
|
|
115
|
+
kll = kll_floats_sketch(k)
|
|
116
|
+
self.assertTrue(kll.is_empty())
|
|
117
|
+
|
|
118
|
+
if __name__ == '__main__':
|
|
119
|
+
unittest.main()
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
18
|
+
import unittest
|
|
19
|
+
|
|
20
|
+
from datasketches import theta_sketch, update_theta_sketch
|
|
21
|
+
from datasketches import compact_theta_sketch, theta_union
|
|
22
|
+
from datasketches import theta_intersection, theta_a_not_b
|
|
23
|
+
|
|
24
|
+
class ThetaTest(unittest.TestCase):
|
|
25
|
+
def test_theta_basic_example(self):
|
|
26
|
+
k = 12 # 2^k = 4096 rows in the table
|
|
27
|
+
n = 1 << 18 # ~256k unique values
|
|
28
|
+
|
|
29
|
+
# create a sketch and inject some values
|
|
30
|
+
sk = self.generate_theta_sketch(n, k)
|
|
31
|
+
|
|
32
|
+
# we can check that the upper and lower bounds bracket the
|
|
33
|
+
# estimate, without needing to know the exact value.
|
|
34
|
+
self.assertLessEqual(sk.get_lower_bound(1), sk.get_estimate())
|
|
35
|
+
self.assertGreaterEqual(sk.get_upper_bound(1), sk.get_estimate())
|
|
36
|
+
|
|
37
|
+
# because this sketch is deterministically generated, we can
|
|
38
|
+
# also compare against the exact value
|
|
39
|
+
self.assertLessEqual(sk.get_lower_bound(1), n)
|
|
40
|
+
self.assertGreaterEqual(sk.get_upper_bound(1), n)
|
|
41
|
+
|
|
42
|
+
# serialize for storage and reconstruct
|
|
43
|
+
sk_bytes = sk.serialize()
|
|
44
|
+
new_sk = update_theta_sketch.deserialize(sk_bytes)
|
|
45
|
+
|
|
46
|
+
# estimate remains unchanged
|
|
47
|
+
self.assertFalse(sk.is_empty())
|
|
48
|
+
self.assertEqual(sk.get_estimate(), new_sk.get_estimate())
|
|
49
|
+
|
|
50
|
+
def test_theta_set_operations(self):
|
|
51
|
+
k = 12 # 2^k = 4096 rows in the table
|
|
52
|
+
n = 1 << 18 # ~256k unique values
|
|
53
|
+
|
|
54
|
+
# we'll have 1/4 of the values overlap
|
|
55
|
+
offset = int(3 * n / 4) # it's a float w/o cast
|
|
56
|
+
|
|
57
|
+
# create a couple sketches and inject some values
|
|
58
|
+
sk1 = self.generate_theta_sketch(n, k)
|
|
59
|
+
sk2 = self.generate_theta_sketch(n, k, offset)
|
|
60
|
+
|
|
61
|
+
# UNIONS
|
|
62
|
+
# create a union object
|
|
63
|
+
union = theta_union(k)
|
|
64
|
+
union.update(sk1)
|
|
65
|
+
union.update(sk2)
|
|
66
|
+
|
|
67
|
+
# getting result from union returns a compact_theta_sketch
|
|
68
|
+
# compact theta sketches can be used in additional unions
|
|
69
|
+
# or set operations but cannot accept further item updates
|
|
70
|
+
result = union.get_result()
|
|
71
|
+
self.assertTrue(isinstance(result, compact_theta_sketch))
|
|
72
|
+
|
|
73
|
+
# since our process here is deterministic, we have
|
|
74
|
+
# checked and know the exact answer is within one
|
|
75
|
+
# standard deviation of the estimate
|
|
76
|
+
self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
|
|
77
|
+
self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# INTERSECTIONS
|
|
81
|
+
# create an intersection object
|
|
82
|
+
intersect = theta_intersection() # no lg_k
|
|
83
|
+
intersect.update(sk1)
|
|
84
|
+
intersect.update(sk2)
|
|
85
|
+
|
|
86
|
+
# has_result() indicates the intersection has been used,
|
|
87
|
+
# although the result may be the empty set
|
|
88
|
+
self.assertTrue(intersect.has_result())
|
|
89
|
+
|
|
90
|
+
# as with unions, the result is a compact sketch
|
|
91
|
+
result = intersect.get_result()
|
|
92
|
+
self.assertTrue(isinstance(result, compact_theta_sketch))
|
|
93
|
+
|
|
94
|
+
# we know the sets overlap by 1/4
|
|
95
|
+
self.assertLessEqual(result.get_lower_bound(1), n / 4)
|
|
96
|
+
self.assertGreaterEqual(result.get_upper_bound(1), n / 4)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# A NOT B
|
|
100
|
+
# create an a_not_b object
|
|
101
|
+
anb = theta_a_not_b() # no lg_k
|
|
102
|
+
result = anb.compute(sk1, sk2)
|
|
103
|
+
|
|
104
|
+
# as with unions, the result is a compact sketch
|
|
105
|
+
self.assertTrue(isinstance(result, compact_theta_sketch))
|
|
106
|
+
|
|
107
|
+
# we know the sets overlap by 1/4, so the remainder is 3/4
|
|
108
|
+
self.assertLessEqual(result.get_lower_bound(1), 3 * n / 4)
|
|
109
|
+
self.assertGreaterEqual(result.get_upper_bound(1), 3 * n / 4)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def generate_theta_sketch(self, n, k, offset=0):
|
|
113
|
+
sk = update_theta_sketch(k)
|
|
114
|
+
for i in range(0, n):
|
|
115
|
+
sk.update(i + offset)
|
|
116
|
+
return sk
|
|
117
|
+
|
|
118
|
+
if __name__ == '__main__':
|
|
119
|
+
unittest.main()
|
|
120
|
+
|
|
121
|
+
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
18
|
+
import unittest
|
|
19
|
+
from datasketches import (vector_of_kll_ints_sketches,
|
|
20
|
+
vector_of_kll_floats_sketches)
|
|
21
|
+
import numpy as np
|
|
22
|
+
|
|
23
|
+
class VectorOfKllSketchesTest(unittest.TestCase):
|
|
24
|
+
def test_vector_of_kll_floats_sketches_example(self):
|
|
25
|
+
k = 200
|
|
26
|
+
d = 3
|
|
27
|
+
n = 2 ** 20
|
|
28
|
+
|
|
29
|
+
# create a sketch and inject ~1 million N(0,1) points
|
|
30
|
+
kll = vector_of_kll_floats_sketches(k, d)
|
|
31
|
+
# Track the min/max for each sketch to test later
|
|
32
|
+
smin = np.zeros(d) + np.inf
|
|
33
|
+
smax = np.zeros(d) - np.inf
|
|
34
|
+
|
|
35
|
+
for i in range(0, n):
|
|
36
|
+
dat = np.random.randn(d)
|
|
37
|
+
smin = np.amin([smin, dat], axis=0)
|
|
38
|
+
smax = np.amax([smax, dat], axis=0)
|
|
39
|
+
kll.update(dat)
|
|
40
|
+
|
|
41
|
+
# 0 should be near the median
|
|
42
|
+
np.testing.assert_allclose(0.5, kll.get_ranks(0.0), atol=0.025)
|
|
43
|
+
# the median should be near 0
|
|
44
|
+
np.testing.assert_allclose(0.0, kll.get_quantiles(0.5), atol=0.025)
|
|
45
|
+
# we also track the min/max independently from the rest of the data
|
|
46
|
+
# which lets us know the full observed data range
|
|
47
|
+
np.testing.assert_allclose(kll.get_min_values(), smin)
|
|
48
|
+
np.testing.assert_allclose(kll.get_max_values(), smax)
|
|
49
|
+
np.testing.assert_array_less(kll.get_min_values(), kll.get_quantiles(0.01)[:,0])
|
|
50
|
+
np.testing.assert_array_less(kll.get_quantiles(0.99)[:,0], kll.get_max_values())
|
|
51
|
+
|
|
52
|
+
# we can also extract a list of values at a time,
|
|
53
|
+
# here the values should give us something close to [-2, -1, 0, 1, 2].
|
|
54
|
+
# then get the CDF, which will return something close to
|
|
55
|
+
# the original values used in get_quantiles()
|
|
56
|
+
# finally, can check the normalized rank error bound
|
|
57
|
+
pts = kll.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772])
|
|
58
|
+
# use the mean pts for the CDF, include 1.0 at end to account for all probability mass
|
|
59
|
+
meanpts = np.mean(pts, axis=0)
|
|
60
|
+
cdf = kll.get_cdf(meanpts)
|
|
61
|
+
self.assertEqual(cdf.shape[0], pts.shape[0])
|
|
62
|
+
self.assertEqual(cdf.shape[1], pts.shape[1]+1)
|
|
63
|
+
|
|
64
|
+
# and a few basic queries about the sketch
|
|
65
|
+
self.assertFalse(np.all(kll.is_empty()))
|
|
66
|
+
self.assertTrue(np.all(kll.is_estimation_mode()))
|
|
67
|
+
self.assertTrue(np.all(kll.get_n() == n))
|
|
68
|
+
self.assertTrue(np.all(kll.get_num_retained() < n))
|
|
69
|
+
|
|
70
|
+
# we can combine sketches across all dimensions and get the result
|
|
71
|
+
result = kll.collapse()
|
|
72
|
+
self.assertEqual(result.get_n(), d * n)
|
|
73
|
+
|
|
74
|
+
# merging a copy of itself will double the number of items the sketch has seen
|
|
75
|
+
kll_copy = vector_of_kll_floats_sketches(kll)
|
|
76
|
+
kll.merge(kll_copy)
|
|
77
|
+
np.testing.assert_equal(kll.get_n(), 2*n)
|
|
78
|
+
|
|
79
|
+
# we can then serialize and reconstruct the sketch
|
|
80
|
+
kll_bytes = kll.serialize() # serializes each sketch as a list
|
|
81
|
+
new_kll = vector_of_kll_floats_sketches(k, d)
|
|
82
|
+
for s in range(len(kll_bytes)):
|
|
83
|
+
new_kll.deserialize(kll_bytes[s], s)
|
|
84
|
+
|
|
85
|
+
# everything should be exactly equal
|
|
86
|
+
np.testing.assert_equal(kll.get_num_retained(), new_kll.get_num_retained())
|
|
87
|
+
np.testing.assert_equal(kll.get_min_values(), new_kll.get_min_values())
|
|
88
|
+
np.testing.assert_equal(kll.get_max_values(), new_kll.get_max_values())
|
|
89
|
+
np.testing.assert_equal(kll.get_quantiles(0.7), new_kll.get_quantiles(0.7))
|
|
90
|
+
np.testing.assert_equal(kll.get_ranks(0.0), new_kll.get_ranks(0.0))
|
|
91
|
+
|
|
92
|
+
def test_kll_ints_sketches(self):
    """Smoke-test that the integer vector-of-KLL-sketches type instantiates.

    The float variant is covered in detail elsewhere in this file, so this
    only constructs the integer variant and checks that every sketch in
    the vector reports itself as empty.
    """
    num_dims = 5
    sketch_k = 100
    int_sketches = vector_of_kll_ints_sketches(sketch_k, num_dims)
    all_empty = np.all(int_sketches.is_empty())
    self.assertTrue(all_empty)
|
|
98
|
+
|
|
99
|
+
def test_kll_2Dupdates(self):
    """Feed 2D batches into a vector of float KLL sketches and verify the results.

    Each call to update() receives an (nbatch, d) array of N(0,1) samples.
    Afterwards the rank of 0.0 and the median are checked against their
    expected values, and the min/max reported by the sketches are compared
    with extrema tracked independently in numpy.
    """
    # the 1D case is tested in the first example;
    # the 2D case follows the same idea, but focuses on update()
    k = 200
    d = 3
    # we'll do ~250k updates of 4 values each (total ~1mil updates, as above)
    n = 2 ** 18
    nbatch = 4

    # create a sketch and inject ~1 million N(0,1) points
    kll = vector_of_kll_floats_sketches(k, d)
    # track the min/max for each sketch to test later
    smin = np.full(d, np.inf)
    smax = np.full(d, -np.inf)

    for _ in range(n):
        dat = np.random.randn(nbatch, d)
        # running column-wise extrema; np.minimum/np.maximum give the same
        # result as stacking and reducing, without the deprecated np.row_stack
        smin = np.minimum(smin, dat.min(axis=0))
        smax = np.maximum(smax, dat.max(axis=0))
        kll.update(dat)

    # 0 should be near the median
    np.testing.assert_allclose(0.5, kll.get_ranks(0.0), atol=0.025)
    # the median should be near 0
    np.testing.assert_allclose(0.0, kll.get_quantiles(0.5), atol=0.025)
    # we also track the min/max independently from the rest of the data
    # which lets us know the full observed data range
    np.testing.assert_allclose(kll.get_min_values(), smin)
    np.testing.assert_allclose(kll.get_max_values(), smax)
|
|
128
|
+
|
|
129
|
+
def test_kll_3Dupdates(self):
    """Verify that a 3D update is rejected and leaves every sketch empty."""
    k = 200
    d = 3

    # create a sketch
    kll = vector_of_kll_floats_sketches(k, d)

    # we'll try 1 3D update, which is expected to fail.
    # assertRaises makes the test fail if the update is silently accepted,
    # which the previous bare `except: pass` could not detect. The concrete
    # exception type raised by the binding is not visible from this file,
    # so the check is kept at Exception.
    dat = np.random.randn(10, 7, d)
    with self.assertRaises(Exception):
        kll.update(dat)

    # the sketches should still be empty
    self.assertTrue(np.all(kll.is_empty()))
|
|
146
|
+
|
|
147
|
+
# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
18
|
+
import unittest
|
|
19
|
+
from datasketches import var_opt_sketch, var_opt_union
|
|
20
|
+
|
|
21
|
+
class VoTest(unittest.TestCase):
    """Worked example exercising the var_opt_sketch and var_opt_union bindings."""

    def test_vo_example(self):
        """Fill a sketch, estimate subset sums, then union it with a second sketch."""
        k = 50  # a small value so we can easily fill the sketch
        sketch = var_opt_sketch(k)

        # with equally weighted items, varopt sampling reduces to standard
        # reservoir sampling, although the algorithm will be significantly
        # slower than an optimized reservoir sampler
        n = 5 * k
        for value in range(n):
            sketch.update(value)

        # add one heavy item, giving it a negative value so it is easy to
        # filter for later. keep in mind that "heavy" is a relative concept,
        # so a fixed multiple of n may not be considered heavy for larger
        # values of n
        sketch.update(-1, 1000 * n)
        self.assertEqual(k, sketch.k)
        self.assertEqual(k, sketch.num_samples)
        self.assertEqual(n + 1, sketch.n)
        self.assertFalse(sketch.is_empty())

        # the list of items in the sample is directly available
        samples = sketch.get_samples()
        self.assertEqual(len(samples), k)

        # a predicate can be applied to the sketch to get an estimate
        # (with optimally minimal variance) of the subset sum of items
        # matching that predicate among the entire population

        # a lambda is used here, but any function operating on a single
        # item which returns a boolean value should work
        negative_summary = sketch.estimate_subset_sum(lambda x: x < 0)
        self.assertEqual(negative_summary['estimate'], 1000 * n)
        self.assertEqual(negative_summary['total_sketch_weight'], 1001 * n)

        # a regular function is handled the same way
        def is_non_negative(x):
            return x >= 0

        non_negative_summary = sketch.estimate_subset_sum(is_non_negative)
        self.assertEqual(non_negative_summary['estimate'], n)
        self.assertEqual(non_negative_summary['total_sketch_weight'], 1001 * n)

        # next, build a second, smaller sketch holding only items that are
        # heavier relative to the previous sketch, but with the sketch in
        # sampling mode
        k2 = 5
        heavy_sketch = var_opt_sketch(k2)
        # for the weight, reuse the estimate of all items >= 0 from before
        heavy_weight = non_negative_summary['estimate']
        for offset in range(k2 + 1):
            heavy_sketch.update((2 * n) + offset, heavy_weight)

        # now union the sketches, demonstrating how the union's k may not
        # be equal to that of either input value
        union = var_opt_union(k)
        union.update(sketch)
        union.update(heavy_sketch)

        merged = union.get_result()
        self.assertEqual(n + k2 + 2, merged.n)
        self.assertFalse(merged.is_empty())
        self.assertGreater(merged.k, k2)
        self.assertLess(merged.k, k)

        # we can compare what information is available from both
        # the union and a sketch.
        print(union)

        # printing the list of items requires a __str__() method for each
        # item (which need not be the same type; they're all generic python
        # objects when used from python), otherwise you may trigger an
        # exception. to_string() is provided as a convenience to avoid
        # direct calls to __str__() with parameters.
        print(merged.to_string(True))
|
|
99
|
+
|
|
100
|
+
# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
|