datasketches 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/NOTICE +1 -1
- data/README.md +0 -2
- data/ext/datasketches/cpc_wrapper.cpp +2 -2
- data/ext/datasketches/kll_wrapper.cpp +0 -10
- data/lib/datasketches/version.rb +1 -1
- data/lib/datasketches.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
- data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
- data/vendor/datasketches-cpp/Doxyfile +2827 -0
- data/vendor/datasketches-cpp/LICENSE +0 -76
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +1 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
- data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
- data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
- data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
- data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
- data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
- data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
- data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
- data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
- data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
- data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
- data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
- data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
- data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +47 -93
- data/vendor/datasketches-cpp/MANIFEST.in +0 -39
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/pyproject.toml +0 -23
- data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
- data/vendor/datasketches-cpp/python/README.md +0 -85
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
- data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
- data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
- data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
- data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
- data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +0 -110
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tox.ini +0 -26
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <catch2/catch.hpp>
|
|
21
|
+
#include <fstream>
|
|
22
|
+
#include <hll.hpp>
|
|
23
|
+
|
|
24
|
+
namespace datasketches {
|
|
25
|
+
|
|
26
|
+
// assume the binary sketches for this test have been generated by datasketches-java code
|
|
27
|
+
// in the subdirectory called "java" in the root directory of this project
|
|
28
|
+
static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
|
|
29
|
+
|
|
30
|
+
TEST_CASE("hll4 sketch", "[serde_compat]") {
|
|
31
|
+
const unsigned n_arr[] = {0, 10, 100, 1000, 10000, 100000, 1000000};
|
|
32
|
+
for (const unsigned n: n_arr) {
|
|
33
|
+
std::ifstream is;
|
|
34
|
+
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
35
|
+
is.open(testBinaryInputPath + "hll4_n" + std::to_string(n) + "_java.sk", std::ios::binary);
|
|
36
|
+
const auto sketch = hll_sketch::deserialize(is);
|
|
37
|
+
REQUIRE(sketch.get_lg_config_k() == 12);
|
|
38
|
+
REQUIRE(sketch.is_empty() == (n == 0));
|
|
39
|
+
REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.02));
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
TEST_CASE("hll6 sketch", "[serde_compat]") {
|
|
44
|
+
const unsigned n_arr[] = {0, 10, 100, 1000, 10000, 100000, 1000000};
|
|
45
|
+
for (const unsigned n: n_arr) {
|
|
46
|
+
std::ifstream is;
|
|
47
|
+
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
48
|
+
is.open(testBinaryInputPath + "hll6_n" + std::to_string(n) + "_java.sk", std::ios::binary);
|
|
49
|
+
const auto sketch = hll_sketch::deserialize(is);
|
|
50
|
+
REQUIRE(sketch.get_lg_config_k() == 12);
|
|
51
|
+
REQUIRE(sketch.is_empty() == (n == 0));
|
|
52
|
+
REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.02));
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
TEST_CASE("hll8 sketch", "[serde_compat]") {
|
|
57
|
+
const unsigned n_arr[] = {0, 10, 100, 1000, 10000, 100000, 1000000};
|
|
58
|
+
for (const unsigned n: n_arr) {
|
|
59
|
+
std::ifstream is;
|
|
60
|
+
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
61
|
+
is.open(testBinaryInputPath + "hll8_n" + std::to_string(n) + "_java.sk", std::ios::binary);
|
|
62
|
+
const auto sketch = hll_sketch::deserialize(is);
|
|
63
|
+
REQUIRE(sketch.get_lg_config_k() == 12);
|
|
64
|
+
REQUIRE(sketch.is_empty() == (n == 0));
|
|
65
|
+
REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.02));
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
} /* namespace datasketches */
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <catch2/catch.hpp>
|
|
21
|
+
#include <fstream>
|
|
22
|
+
#include <hll.hpp>
|
|
23
|
+
|
|
24
|
+
namespace datasketches {
|
|
25
|
+
|
|
26
|
+
TEST_CASE("hll sketch generate", "[serialize_for_java]") {
|
|
27
|
+
const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
|
|
28
|
+
for (const unsigned n: n_arr) {
|
|
29
|
+
hll_sketch hll4(12, HLL_4);
|
|
30
|
+
hll_sketch hll6(12, HLL_6);
|
|
31
|
+
hll_sketch hll8(12, HLL_8);
|
|
32
|
+
for (unsigned i = 0; i < n; ++i) {
|
|
33
|
+
hll4.update(i);
|
|
34
|
+
hll6.update(i);
|
|
35
|
+
hll8.update(i);
|
|
36
|
+
}
|
|
37
|
+
{
|
|
38
|
+
std::ofstream os("hll4_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
|
|
39
|
+
hll4.serialize_compact(os);
|
|
40
|
+
}
|
|
41
|
+
{
|
|
42
|
+
std::ofstream os("hll6_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
|
|
43
|
+
hll6.serialize_compact(os);
|
|
44
|
+
}
|
|
45
|
+
{
|
|
46
|
+
std::ofstream os("hll8_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
|
|
47
|
+
hll8.serialize_compact(os);
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
} /* namespace datasketches */
|
|
@@ -99,7 +99,7 @@ void kll_helper::randomly_halve_down(T* buf, uint32_t start, uint32_t length) {
|
|
|
99
99
|
#ifdef KLL_VALIDATION
|
|
100
100
|
const uint32_t offset = deterministic_offset();
|
|
101
101
|
#else
|
|
102
|
-
const uint32_t offset = random_bit();
|
|
102
|
+
const uint32_t offset = random_utils::random_bit();
|
|
103
103
|
#endif
|
|
104
104
|
uint32_t j = start + offset;
|
|
105
105
|
for (uint32_t i = start; i < (start + half_length); i++) {
|
|
@@ -115,7 +115,7 @@ void kll_helper::randomly_halve_up(T* buf, uint32_t start, uint32_t length) {
|
|
|
115
115
|
#ifdef KLL_VALIDATION
|
|
116
116
|
const uint32_t offset = deterministic_offset();
|
|
117
117
|
#else
|
|
118
|
-
const uint32_t offset = random_bit();
|
|
118
|
+
const uint32_t offset = random_utils::random_bit();
|
|
119
119
|
#endif
|
|
120
120
|
uint32_t j = (start + length) - 1 - offset;
|
|
121
121
|
for (uint32_t i = (start + length) - 1; i >= (start + half_length); i--) {
|
|
@@ -26,10 +26,22 @@
|
|
|
26
26
|
#include "common_defs.hpp"
|
|
27
27
|
#include "serde.hpp"
|
|
28
28
|
#include "quantiles_sorted_view.hpp"
|
|
29
|
+
#include "optional.hpp"
|
|
29
30
|
|
|
30
31
|
namespace datasketches {
|
|
31
32
|
|
|
32
|
-
|
|
33
|
+
/// KLL sketch constants
|
|
34
|
+
namespace kll_constants {
|
|
35
|
+
/// default value of parameter K
|
|
36
|
+
const uint16_t DEFAULT_K = 200;
|
|
37
|
+
const uint8_t DEFAULT_M = 8;
|
|
38
|
+
/// min value of parameter K
|
|
39
|
+
const uint16_t MIN_K = DEFAULT_M;
|
|
40
|
+
/// max value of parameter K
|
|
41
|
+
const uint16_t MAX_K = (1 << 16) - 1;
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
/**
|
|
33
45
|
* Implementation of a very compact quantiles sketch with lazy compaction scheme
|
|
34
46
|
* and nearly optimal accuracy per retained item.
|
|
35
47
|
* See <a href="https://arxiv.org/abs/1603.05346v2">Optimal Quantile Approximation in Streams</a>.
|
|
@@ -146,10 +158,6 @@ namespace datasketches {
|
|
|
146
158
|
* author Lee Rhodes
|
|
147
159
|
*/
|
|
148
160
|
|
|
149
|
-
namespace kll_constants {
|
|
150
|
-
const uint16_t DEFAULT_K = 200;
|
|
151
|
-
}
|
|
152
|
-
|
|
153
161
|
template <
|
|
154
162
|
typename T,
|
|
155
163
|
typename C = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare)
|
|
@@ -159,17 +167,51 @@ class kll_sketch {
|
|
|
159
167
|
public:
|
|
160
168
|
using value_type = T;
|
|
161
169
|
using comparator = C;
|
|
170
|
+
using allocator_type = A;
|
|
162
171
|
using vector_u32 = std::vector<uint32_t, typename std::allocator_traits<A>::template rebind_alloc<uint32_t>>;
|
|
172
|
+
using vector_double = typename quantiles_sorted_view<T, C, A>::vector_double;
|
|
163
173
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
174
|
+
/**
|
|
175
|
+
* Quantile return type.
|
|
176
|
+
* This is to return quantiles either by value (for arithmetic types) or by const reference (for all other types)
|
|
177
|
+
*/
|
|
178
|
+
using quantile_return_type = typename quantiles_sorted_view<T, C, A>::quantile_return_type;
|
|
167
179
|
|
|
180
|
+
/**
|
|
181
|
+
* Constructor
|
|
182
|
+
* @param k affects the size of the sketch and its estimation error
|
|
183
|
+
* @param comparator strict weak ordering function (see C++ named requirements: Compare)
|
|
184
|
+
* @param allocator used by this sketch to allocate memory
|
|
185
|
+
*/
|
|
168
186
|
explicit kll_sketch(uint16_t k = kll_constants::DEFAULT_K, const C& comparator = C(), const A& allocator = A());
|
|
187
|
+
|
|
188
|
+
/**
|
|
189
|
+
* Copy constructor
|
|
190
|
+
* @param other sketch to be copied
|
|
191
|
+
*/
|
|
169
192
|
kll_sketch(const kll_sketch& other);
|
|
193
|
+
|
|
194
|
+
/**
|
|
195
|
+
* Move constructor
|
|
196
|
+
* @param other sketch to be moved
|
|
197
|
+
*/
|
|
170
198
|
kll_sketch(kll_sketch&& other) noexcept;
|
|
199
|
+
|
|
200
|
+
|
|
171
201
|
~kll_sketch();
|
|
202
|
+
|
|
203
|
+
/**
|
|
204
|
+
* Copy assignment
|
|
205
|
+
* @param other sketch to be copied
|
|
206
|
+
* @return reference to this sketch
|
|
207
|
+
*/
|
|
172
208
|
kll_sketch& operator=(const kll_sketch& other);
|
|
209
|
+
|
|
210
|
+
/**
|
|
211
|
+
* Move assignment
|
|
212
|
+
* @param other sketch to be moved
|
|
213
|
+
* @return reference to this sketch
|
|
214
|
+
*/
|
|
173
215
|
kll_sketch& operator=(kll_sketch&& other);
|
|
174
216
|
|
|
175
217
|
/*
|
|
@@ -262,44 +304,8 @@ class kll_sketch {
|
|
|
262
304
|
*
|
|
263
305
|
* @return approximate quantile associated with the given rank
|
|
264
306
|
*/
|
|
265
|
-
using quantile_return_type = typename quantiles_sorted_view<T, C, A>::quantile_return_type;
|
|
266
307
|
quantile_return_type get_quantile(double rank, bool inclusive = true) const;
|
|
267
308
|
|
|
268
|
-
/**
|
|
269
|
-
* This returns an array that could have been generated by using get_quantile() for each
|
|
270
|
-
* rank separately.
|
|
271
|
-
*
|
|
272
|
-
* <p>If the sketch is empty this throws std::runtime_error.
|
|
273
|
-
*
|
|
274
|
-
* @param ranks given array of ranks in the hypothetical sorted stream.
|
|
275
|
-
* These ranks must be in the interval [0.0, 1.0].
|
|
276
|
-
* @param size the number of ranks in the array
|
|
277
|
-
* @param inclusive if true, the given ranks are considered inclusive (include weights of items)
|
|
278
|
-
*
|
|
279
|
-
* @return array of approximate quantiles corresponding to the given ranks in the same order.
|
|
280
|
-
*
|
|
281
|
-
* Deprecated. Will be removed in the next major version. Use get_quantile() instead.
|
|
282
|
-
*/
|
|
283
|
-
std::vector<T, A> get_quantiles(const double* ranks, uint32_t size, bool inclusive = true) const;
|
|
284
|
-
|
|
285
|
-
/**
|
|
286
|
-
* This is a multiple-query version of get_quantile() that allows the caller to
|
|
287
|
-
* specify the number of evenly-spaced ranks.
|
|
288
|
-
*
|
|
289
|
-
* <p>If the sketch is empty this throws std::runtime_error.
|
|
290
|
-
*
|
|
291
|
-
* @param num an integer that specifies the number of evenly-spaced ranks.
|
|
292
|
-
* This must be an integer greater than 0. A value of 1 will return the quantile of rank 0.
|
|
293
|
-
* A value of 2 will return quantiles of ranks 0 and 1. A value of 3 will return quantiles of ranks 0,
|
|
294
|
-
* 0.5 (median) and 1, etc.
|
|
295
|
-
* @param inclusive if true, the ranks are considered inclusive (include weights of items)
|
|
296
|
-
*
|
|
297
|
-
* @return array of approximate quantiles corresponding to the given number of evenly-spaced ranks.
|
|
298
|
-
*
|
|
299
|
-
* Deprecated. Will be removed in the next major version. Use get_quantile() instead.
|
|
300
|
-
*/
|
|
301
|
-
std::vector<T, A> get_quantiles(uint32_t num, bool inclusive = true) const;
|
|
302
|
-
|
|
303
309
|
/**
|
|
304
310
|
* Returns an approximation to the normalized rank of the given item from 0 to 1, inclusive.
|
|
305
311
|
*
|
|
@@ -339,7 +345,6 @@ class kll_sketch {
|
|
|
339
345
|
* @return an array of m+1 doubles each of which is an approximation
|
|
340
346
|
* to the fraction of the input stream items (the mass) that fall into one of those intervals.
|
|
341
347
|
*/
|
|
342
|
-
using vector_double = typename quantiles_sorted_view<T, C, A>::vector_double;
|
|
343
348
|
vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
|
|
344
349
|
|
|
345
350
|
/**
|
|
@@ -489,9 +494,26 @@ class kll_sketch {
|
|
|
489
494
|
string<A> to_string(bool print_levels = false, bool print_items = false) const;
|
|
490
495
|
|
|
491
496
|
class const_iterator;
|
|
497
|
+
|
|
498
|
+
/**
|
|
499
|
+
* Iterator pointing to the first item in the sketch.
|
|
500
|
+
* If the sketch is empty, the returned iterator must not be dereferenced or incremented.
|
|
501
|
+
* @return iterator pointing to the first item in the sketch
|
|
502
|
+
*/
|
|
492
503
|
const_iterator begin() const;
|
|
504
|
+
|
|
505
|
+
/**
|
|
506
|
+
* Iterator pointing to the past-the-end item in the sketch.
|
|
507
|
+
* The past-the-end item is the hypothetical item that would follow the last item.
|
|
508
|
+
* It does not point to any item, and must not be dereferenced or incremented.
|
|
509
|
+
* @return iterator pointing to the past-the-end item in the sketch
|
|
510
|
+
*/
|
|
493
511
|
const_iterator end() const;
|
|
494
512
|
|
|
513
|
+
/**
|
|
514
|
+
* Gets the sorted view of this sketch
|
|
515
|
+
* @return the sorted view of this sketch
|
|
516
|
+
*/
|
|
495
517
|
quantiles_sorted_view<T, C, A> get_sorted_view() const;
|
|
496
518
|
|
|
497
519
|
private:
|
|
@@ -529,16 +551,15 @@ class kll_sketch {
|
|
|
529
551
|
vector_u32 levels_;
|
|
530
552
|
T* items_;
|
|
531
553
|
uint32_t items_size_;
|
|
532
|
-
T
|
|
533
|
-
T
|
|
554
|
+
optional<T> min_item_;
|
|
555
|
+
optional<T> max_item_;
|
|
534
556
|
mutable quantiles_sorted_view<T, C, A>* sorted_view_;
|
|
535
557
|
|
|
536
558
|
// for deserialization
|
|
537
|
-
class item_deleter;
|
|
538
559
|
class items_deleter;
|
|
539
560
|
kll_sketch(uint16_t k, uint16_t min_k, uint64_t n, uint8_t num_levels, vector_u32&& levels,
|
|
540
|
-
std::unique_ptr<T, items_deleter> items, uint32_t items_size,
|
|
541
|
-
|
|
561
|
+
std::unique_ptr<T, items_deleter> items, uint32_t items_size, optional<T>&& min_item,
|
|
562
|
+
optional<T>&& max_item, bool is_level_zero_sorted, const C& comparator);
|
|
542
563
|
|
|
543
564
|
// common update code
|
|
544
565
|
inline void update_min_max(const T& item);
|