datasketches 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +1 -1
- data/ext/datasketches/kll_wrapper.cpp +5 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +4 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +2 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +25 -9
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +96 -42
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +105 -127
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +94 -25
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
- data/vendor/datasketches-cpp/pyproject.toml +1 -1
- data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
- data/vendor/datasketches-cpp/python/README.md +7 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
- data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -2
- data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +62 -59
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +31 -26
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +25 -9
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +8 -6
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +7 -45
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +29 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +2 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +16 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -0
- metadata +25 -9
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9eaa8a17efdbc591b3e56f94650e887babd30dc79d95db3a7986df0261184191
|
|
4
|
+
data.tar.gz: 5544326a0edf165d87373a680d8bf5b80acba2894b9048f92cbdb261fcd66d57
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5a28c093ecda083762367149800770f59fee8e630c0d983d3f29ed32d027fae2e2515dff243ee11bbd41f4875c7cea622f7bc5cc5d7e73176e785503ed19fc0b
|
|
7
|
+
data.tar.gz: 6b210f2fdca1ae3cbd4e4cbf88e284855014b5a1e1c883085dc96a057da29e370005163ce628e54351c9127b00fae4b7b33a4ca63e6f4b90e0665e93b7742a66
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
|
@@ -33,7 +33,11 @@ void bind_kll_sketch(Rice::Module& m, const char* name) {
|
|
|
33
33
|
.define_method("estimation_mode?", &kll_sketch<T>::is_estimation_mode)
|
|
34
34
|
.define_method("min_value", &kll_sketch<T>::get_min_value)
|
|
35
35
|
.define_method("max_value", &kll_sketch<T>::get_max_value)
|
|
36
|
-
.define_method(
|
|
36
|
+
.define_method(
|
|
37
|
+
"quantile",
|
|
38
|
+
[](kll_sketch<T>& self, double fraction) {
|
|
39
|
+
return self.get_quantile(fraction);
|
|
40
|
+
})
|
|
37
41
|
.define_method(
|
|
38
42
|
"quantiles",
|
|
39
43
|
[](kll_sketch<T>& self, Rice::Object obj) {
|
data/lib/datasketches/version.rb
CHANGED
|
@@ -15,9 +15,9 @@
|
|
|
15
15
|
# specific language governing permissions and limitations
|
|
16
16
|
# under the License.
|
|
17
17
|
|
|
18
|
-
cmake_minimum_required(VERSION 3.
|
|
18
|
+
cmake_minimum_required(VERSION 3.16.0)
|
|
19
19
|
project(DataSketches
|
|
20
|
-
VERSION 3.
|
|
20
|
+
VERSION 3.4.0
|
|
21
21
|
LANGUAGES CXX)
|
|
22
22
|
|
|
23
23
|
include(GNUInstallDirs)
|
|
@@ -106,12 +106,13 @@ add_subdirectory(theta)
|
|
|
106
106
|
add_subdirectory(sampling)
|
|
107
107
|
add_subdirectory(tuple)
|
|
108
108
|
add_subdirectory(req)
|
|
109
|
+
add_subdirectory(quantiles)
|
|
109
110
|
|
|
110
111
|
if (WITH_PYTHON)
|
|
111
112
|
add_subdirectory(python)
|
|
112
113
|
endif()
|
|
113
114
|
|
|
114
|
-
target_link_libraries(datasketches INTERFACE hll cpc kll fi theta sampling)
|
|
115
|
+
target_link_libraries(datasketches INTERFACE hll cpc kll fi theta sampling req quantiles)
|
|
115
116
|
|
|
116
117
|
if (COVERAGE)
|
|
117
118
|
find_program(LCOV_PATH NAMES "lcov")
|
|
@@ -43,4 +43,8 @@ install(FILES
|
|
|
43
43
|
include/conditional_forward.hpp
|
|
44
44
|
include/ceiling_power_of_2.hpp
|
|
45
45
|
include/bounds_binomial_proportions.hpp
|
|
46
|
+
include/kolmogorov_smirnov.hpp
|
|
47
|
+
include/kolmogorov_smirnov_impl.hpp
|
|
48
|
+
include/quantile_sketch_sorted_view.hpp
|
|
49
|
+
include/quantile_sketch_sorted_view_impl.hpp
|
|
46
50
|
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
|
|
@@ -24,6 +24,8 @@
|
|
|
24
24
|
#include <string>
|
|
25
25
|
#include <memory>
|
|
26
26
|
#include <iostream>
|
|
27
|
+
#include <random>
|
|
28
|
+
#include <chrono>
|
|
27
29
|
|
|
28
30
|
namespace datasketches {
|
|
29
31
|
|
|
@@ -34,6 +36,18 @@ enum resize_factor { X1 = 0, X2, X4, X8 };
|
|
|
34
36
|
template<typename A> using AllocChar = typename std::allocator_traits<A>::template rebind_alloc<char>;
|
|
35
37
|
template<typename A> using string = std::basic_string<char, std::char_traits<char>, AllocChar<A>>;
|
|
36
38
|
|
|
39
|
+
// random bit
|
|
40
|
+
static std::independent_bits_engine<std::mt19937, 1, uint32_t>
|
|
41
|
+
random_bit(static_cast<uint32_t>(std::chrono::system_clock::now().time_since_epoch().count()));
|
|
42
|
+
|
|
43
|
+
// common random declarations
|
|
44
|
+
namespace random_utils {
|
|
45
|
+
static std::random_device rd; // possibly unsafe in MinGW with GCC < 9.2
|
|
46
|
+
static std::mt19937_64 rand(rd());
|
|
47
|
+
static std::uniform_real_distribution<> next_double(0.0, 1.0);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
|
|
37
51
|
// utility function to hide unused compiler warning
|
|
38
52
|
// usually has no additional cost
|
|
39
53
|
template<typename T> void unused(T&&...) {}
|
|
@@ -25,7 +25,8 @@ namespace datasketches {
|
|
|
25
25
|
class kolmogorov_smirnov {
|
|
26
26
|
public:
|
|
27
27
|
/**
|
|
28
|
-
* Computes the raw delta area between two
|
|
28
|
+
* Computes the raw delta area between two quantile sketches for the Kolmogorov-Smirnov Test.
|
|
29
|
+
* Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
|
|
29
30
|
* @param sketch1 KLL sketch 1
|
|
30
31
|
* @param sketch2 KLL sketch 2
|
|
31
32
|
* @return the raw delta between two KLL quantile sketches
|
|
@@ -37,6 +38,7 @@ public:
|
|
|
37
38
|
* Computes the adjusted delta area threshold for the Kolmogorov-Smirnov Test.
|
|
38
39
|
* Adjusts the computed threshold by the error epsilons of the two given sketches.
|
|
39
40
|
* See <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov–Smirnov Test</a>
|
|
41
|
+
* Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
|
|
40
42
|
* @param sketch1 KLL sketch 1
|
|
41
43
|
* @param sketch2 KLL sketch 2
|
|
42
44
|
* @param p Target p-value. Typically .001 to .1, e.g., .05.
|
|
@@ -46,7 +48,8 @@ public:
|
|
|
46
48
|
static double threshold(const Sketch& sketch1, const Sketch& sketch2, double p);
|
|
47
49
|
|
|
48
50
|
/**
|
|
49
|
-
* Performs the Kolmogorov-Smirnov Test between two
|
|
51
|
+
* Performs the Kolmogorov-Smirnov Test between two quantile sketches.
|
|
52
|
+
* Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
|
|
50
53
|
* Note: if the given sketches have insufficient data or if the sketch sizes are too small,
|
|
51
54
|
* this will return false.
|
|
52
55
|
* @param sketch1 KLL sketch 1
|
|
@@ -57,7 +60,6 @@ public:
|
|
|
57
60
|
*/
|
|
58
61
|
template<typename Sketch>
|
|
59
62
|
static bool test(const Sketch& sketch1, const Sketch& sketch2, double p);
|
|
60
|
-
|
|
61
63
|
};
|
|
62
64
|
|
|
63
65
|
} /* namespace datasketches */
|
|
@@ -20,39 +20,36 @@
|
|
|
20
20
|
#ifndef KOLMOGOROV_SMIRNOV_IMPL_HPP_
|
|
21
21
|
#define KOLMOGOROV_SMIRNOV_IMPL_HPP_
|
|
22
22
|
|
|
23
|
-
|
|
23
|
+
#include <cmath>
|
|
24
|
+
#include <algorithm>
|
|
24
25
|
|
|
25
|
-
|
|
26
|
-
template<typename T, typename C, typename S, typename A>
|
|
27
|
-
kll_quantile_calculator<T, C, A> make_quantile_calculator(const kll_sketch<T, C, S, A>& sketch) {
|
|
28
|
-
return kll_quantile_calculator<T, C, A>(sketch);
|
|
29
|
-
}
|
|
26
|
+
namespace datasketches {
|
|
30
27
|
|
|
31
28
|
template<typename Sketch>
|
|
32
29
|
double kolmogorov_smirnov::delta(const Sketch& sketch1, const Sketch& sketch2) {
|
|
33
|
-
|
|
34
|
-
auto
|
|
35
|
-
auto
|
|
36
|
-
auto it1 =
|
|
37
|
-
auto it2 =
|
|
30
|
+
auto comparator = sketch1.get_comparator(); // assuming the same comparator in sketch2
|
|
31
|
+
auto view1 = sketch1.get_sorted_view(true);
|
|
32
|
+
auto view2 = sketch2.get_sorted_view(true);
|
|
33
|
+
auto it1 = view1.begin();
|
|
34
|
+
auto it2 = view2.begin();
|
|
38
35
|
const auto n1 = sketch1.get_n();
|
|
39
36
|
const auto n2 = sketch2.get_n();
|
|
40
37
|
double delta = 0;
|
|
41
|
-
while (it1 !=
|
|
38
|
+
while (it1 != view1.end() && it2 != view2.end()) {
|
|
42
39
|
const double norm_cum_wt1 = static_cast<double>((*it1).second) / n1;
|
|
43
40
|
const double norm_cum_wt2 = static_cast<double>((*it2).second) / n2;
|
|
44
41
|
delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
|
|
45
|
-
if (
|
|
42
|
+
if (comparator((*it1).first, (*it2).first)) {
|
|
46
43
|
++it1;
|
|
47
|
-
} else if (
|
|
44
|
+
} else if (comparator((*it2).first, (*it1).first)) {
|
|
48
45
|
++it2;
|
|
49
46
|
} else {
|
|
50
47
|
++it1;
|
|
51
48
|
++it2;
|
|
52
49
|
}
|
|
53
50
|
}
|
|
54
|
-
const double norm_cum_wt1 = it1 ==
|
|
55
|
-
const double norm_cum_wt2 = it2 ==
|
|
51
|
+
const double norm_cum_wt1 = it1 == view1.end() ? 1 : static_cast<double>((*it1).second) / n1;
|
|
52
|
+
const double norm_cum_wt2 = it2 == view2.end() ? 1 : static_cast<double>((*it2).second) / n2;
|
|
56
53
|
delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
|
|
57
54
|
return delta;
|
|
58
55
|
}
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef QUANTILE_SKETCH_SORTED_VIEW_HPP_
|
|
21
|
+
#define QUANTILE_SKETCH_SORTED_VIEW_HPP_
|
|
22
|
+
|
|
23
|
+
#include <functional>
|
|
24
|
+
|
|
25
|
+
namespace datasketches {
|
|
26
|
+
|
|
27
|
+
template<
|
|
28
|
+
typename T,
|
|
29
|
+
typename Comparator, // strict weak ordering function (see C++ named requirements: Compare)
|
|
30
|
+
typename Allocator
|
|
31
|
+
>
|
|
32
|
+
class quantile_sketch_sorted_view {
|
|
33
|
+
public:
|
|
34
|
+
using Entry = typename std::conditional<std::is_arithmetic<T>::value, std::pair<T, uint64_t>, std::pair<const T*, uint64_t>>::type;
|
|
35
|
+
using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
|
|
36
|
+
using Container = std::vector<Entry, AllocEntry>;
|
|
37
|
+
|
|
38
|
+
quantile_sketch_sorted_view(uint32_t num, const Allocator& allocator);
|
|
39
|
+
|
|
40
|
+
template<typename Iterator>
|
|
41
|
+
void add(Iterator begin, Iterator end, uint64_t weight);
|
|
42
|
+
|
|
43
|
+
template<bool inclusive>
|
|
44
|
+
void convert_to_cummulative();
|
|
45
|
+
|
|
46
|
+
class const_iterator;
|
|
47
|
+
const_iterator begin() const;
|
|
48
|
+
const_iterator end() const;
|
|
49
|
+
|
|
50
|
+
size_t size() const;
|
|
51
|
+
|
|
52
|
+
// makes sense only with cumulative weight
|
|
53
|
+
using quantile_return_type = typename std::conditional<std::is_arithmetic<T>::value, T, const T&>::type;
|
|
54
|
+
quantile_return_type get_quantile(double rank) const;
|
|
55
|
+
|
|
56
|
+
private:
|
|
57
|
+
static inline const T& deref_helper(const T* t) { return *t; }
|
|
58
|
+
static inline T deref_helper(T t) { return t; }
|
|
59
|
+
|
|
60
|
+
struct compare_pairs_by_first {
|
|
61
|
+
bool operator()(const Entry& a, const Entry& b) const {
|
|
62
|
+
return Comparator()(deref_helper(a.first), deref_helper(b.first));
|
|
63
|
+
}
|
|
64
|
+
};
|
|
65
|
+
|
|
66
|
+
struct compare_pairs_by_second {
|
|
67
|
+
bool operator()(const Entry& a, const Entry& b) const {
|
|
68
|
+
return a.second < b.second;
|
|
69
|
+
}
|
|
70
|
+
};
|
|
71
|
+
|
|
72
|
+
template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
|
|
73
|
+
static inline T ref_helper(const T& t) { return t; }
|
|
74
|
+
|
|
75
|
+
template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
|
|
76
|
+
static inline const T* ref_helper(const T& t) { return std::addressof(t); }
|
|
77
|
+
|
|
78
|
+
template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
|
|
79
|
+
static inline Entry make_dummy_entry(uint64_t weight) { return Entry(0, weight); }
|
|
80
|
+
|
|
81
|
+
template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
|
|
82
|
+
static inline Entry make_dummy_entry(uint64_t weight) { return Entry(nullptr, weight); }
|
|
83
|
+
|
|
84
|
+
uint64_t total_weight_;
|
|
85
|
+
Container entries_;
|
|
86
|
+
};
|
|
87
|
+
|
|
88
|
+
template<typename T, typename C, typename A>
|
|
89
|
+
class quantile_sketch_sorted_view<T, C, A>::const_iterator: public quantile_sketch_sorted_view<T, C, A>::Container::const_iterator {
|
|
90
|
+
public:
|
|
91
|
+
using Base = typename quantile_sketch_sorted_view<T, C, A>::Container::const_iterator;
|
|
92
|
+
using value_type = typename std::conditional<std::is_arithmetic<T>::value, typename Base::value_type, std::pair<const T&, const uint64_t>>::type;
|
|
93
|
+
|
|
94
|
+
const_iterator(const Base& it): Base(it) {}
|
|
95
|
+
|
|
96
|
+
template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
|
|
97
|
+
value_type operator*() const { return Base::operator*(); }
|
|
98
|
+
|
|
99
|
+
template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
|
|
100
|
+
value_type operator*() const { return value_type(*(Base::operator*().first), Base::operator*().second); }
|
|
101
|
+
|
|
102
|
+
class return_value_holder {
|
|
103
|
+
public:
|
|
104
|
+
return_value_holder(value_type value): value_(value) {}
|
|
105
|
+
const value_type* operator->() const { return &value_; }
|
|
106
|
+
private:
|
|
107
|
+
value_type value_;
|
|
108
|
+
};
|
|
109
|
+
|
|
110
|
+
template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
|
|
111
|
+
const value_type* operator->() const { return Base::operator->(); }
|
|
112
|
+
|
|
113
|
+
template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
|
|
114
|
+
return_value_holder operator->() const { return **this; }
|
|
115
|
+
};
|
|
116
|
+
|
|
117
|
+
} /* namespace datasketches */
|
|
118
|
+
|
|
119
|
+
#include "quantile_sketch_sorted_view_impl.hpp"
|
|
120
|
+
|
|
121
|
+
#endif
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef QUANTILE_SKETCH_SORTED_VIEW_IMPL_HPP_
|
|
21
|
+
#define QUANTILE_SKETCH_SORTED_VIEW_IMPL_HPP_
|
|
22
|
+
|
|
23
|
+
#include <algorithm>
|
|
24
|
+
#include <stdexcept>
|
|
25
|
+
|
|
26
|
+
namespace datasketches {
|
|
27
|
+
|
|
28
|
+
template<typename T, typename C, typename A>
|
|
29
|
+
quantile_sketch_sorted_view<T, C, A>::quantile_sketch_sorted_view(uint32_t num, const A& allocator):
|
|
30
|
+
total_weight_(0),
|
|
31
|
+
entries_(allocator)
|
|
32
|
+
{
|
|
33
|
+
entries_.reserve(num);
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
template<typename T, typename C, typename A>
|
|
37
|
+
template<typename Iterator>
|
|
38
|
+
void quantile_sketch_sorted_view<T, C, A>::add(Iterator first, Iterator last, uint64_t weight) {
|
|
39
|
+
const size_t size_before = entries_.size();
|
|
40
|
+
for (auto it = first; it != last; ++it) entries_.push_back(Entry(ref_helper(*it), weight));
|
|
41
|
+
if (size_before > 0) {
|
|
42
|
+
Container tmp(entries_.get_allocator());
|
|
43
|
+
tmp.reserve(entries_.capacity());
|
|
44
|
+
std::merge(
|
|
45
|
+
entries_.begin(), entries_.begin() + size_before,
|
|
46
|
+
entries_.begin() + size_before, entries_.end(),
|
|
47
|
+
std::back_inserter(tmp), compare_pairs_by_first()
|
|
48
|
+
);
|
|
49
|
+
std::swap(tmp, entries_);
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
template<typename T, typename C, typename A>
|
|
54
|
+
template<bool inclusive>
|
|
55
|
+
void quantile_sketch_sorted_view<T, C, A>::convert_to_cummulative() {
|
|
56
|
+
uint64_t subtotal = 0;
|
|
57
|
+
for (auto& entry: entries_) {
|
|
58
|
+
const uint64_t new_subtotal = subtotal + entry.second;
|
|
59
|
+
entry.second = inclusive ? new_subtotal : subtotal;
|
|
60
|
+
subtotal = new_subtotal;
|
|
61
|
+
}
|
|
62
|
+
total_weight_ = subtotal;
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
template<typename T, typename C, typename A>
|
|
66
|
+
auto quantile_sketch_sorted_view<T, C, A>::get_quantile(double rank) const -> quantile_return_type {
|
|
67
|
+
if (total_weight_ == 0) throw std::invalid_argument("supported for cumulative weight only");
|
|
68
|
+
uint64_t weight = static_cast<uint64_t>(rank * total_weight_);
|
|
69
|
+
auto it = std::lower_bound(entries_.begin(), entries_.end(), make_dummy_entry<T>(weight), compare_pairs_by_second());
|
|
70
|
+
if (it == entries_.end()) return deref_helper(entries_[entries_.size() - 1].first);
|
|
71
|
+
return deref_helper(it->first);
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
template<typename T, typename C, typename A>
|
|
75
|
+
auto quantile_sketch_sorted_view<T, C, A>::begin() const -> const_iterator {
|
|
76
|
+
return entries_.begin();
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
template<typename T, typename C, typename A>
|
|
80
|
+
auto quantile_sketch_sorted_view<T, C, A>::end() const -> const_iterator {
|
|
81
|
+
return entries_.end();
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
template<typename T, typename C, typename A>
|
|
85
|
+
size_t quantile_sketch_sorted_view<T, C, A>::size() const {
|
|
86
|
+
return entries_.size();
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
} /* namespace datasketches */
|
|
90
|
+
|
|
91
|
+
#endif
|
|
@@ -46,7 +46,7 @@ template<
|
|
|
46
46
|
typename W = uint64_t,
|
|
47
47
|
typename H = std::hash<T>,
|
|
48
48
|
typename E = std::equal_to<T>,
|
|
49
|
-
typename S = serde<T>,
|
|
49
|
+
typename S = serde<T>, // deprecated, to be removed in the next major version
|
|
50
50
|
typename A = std::allocator<T>
|
|
51
51
|
>
|
|
52
52
|
class frequent_items_sketch {
|
|
@@ -225,46 +225,78 @@ public:
|
|
|
225
225
|
/**
|
|
226
226
|
* Computes size needed to serialize the current state of the sketch.
|
|
227
227
|
* This can be expensive since every item needs to be looked at.
|
|
228
|
+
* @param instance of a SerDe
|
|
228
229
|
* @return size in bytes needed to serialize this sketch
|
|
229
230
|
*/
|
|
230
|
-
|
|
231
|
+
template<typename SerDe = S>
|
|
232
|
+
size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
|
|
231
233
|
|
|
232
234
|
/**
|
|
233
235
|
* This method serializes the sketch into a given stream in a binary form
|
|
234
236
|
* @param os output stream
|
|
237
|
+
* @param instance of a SerDe
|
|
235
238
|
*/
|
|
236
|
-
|
|
239
|
+
template<typename SerDe = S>
|
|
240
|
+
void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
|
|
237
241
|
|
|
238
242
|
// This is a convenience alias for users
|
|
239
243
|
// The type returned by the following serialize method
|
|
240
244
|
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
|
|
241
245
|
|
|
242
|
-
|
|
243
246
|
/**
|
|
244
247
|
* This method serializes the sketch as a vector of bytes.
|
|
245
248
|
* An optional header can be reserved in front of the sketch.
|
|
246
249
|
* It is a blank space of a given size.
|
|
247
250
|
* This header is used in Datasketches PostgreSQL extension.
|
|
248
251
|
* @param header_size_bytes space to reserve in front of the sketch
|
|
252
|
+
* @param instance of a SerDe
|
|
249
253
|
* @return serialized sketch as a vector of bytes
|
|
250
254
|
*/
|
|
251
|
-
|
|
255
|
+
template<typename SerDe = S>
|
|
256
|
+
vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
|
|
252
257
|
|
|
253
258
|
/**
|
|
254
259
|
* This method deserializes a sketch from a given stream.
|
|
255
260
|
* @param is input stream
|
|
261
|
+
* @param instance of an Allocator
|
|
256
262
|
* @return an instance of the sketch
|
|
263
|
+
*
|
|
264
|
+
* Deprecated, to be removed in the next major version
|
|
257
265
|
*/
|
|
258
266
|
static frequent_items_sketch deserialize(std::istream& is, const A& allocator = A());
|
|
259
267
|
|
|
268
|
+
/**
|
|
269
|
+
* This method deserializes a sketch from a given stream.
|
|
270
|
+
* @param is input stream
|
|
271
|
+
* @param instance of a SerDe
|
|
272
|
+
* @param instance of an Allocator
|
|
273
|
+
* @return an instance of the sketch
|
|
274
|
+
*/
|
|
275
|
+
template<typename SerDe = S>
|
|
276
|
+
static frequent_items_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
|
|
277
|
+
|
|
260
278
|
/**
|
|
261
279
|
* This method deserializes a sketch from a given array of bytes.
|
|
262
280
|
* @param bytes pointer to the array of bytes
|
|
263
281
|
* @param size the size of the array
|
|
282
|
+
* @param instance of an Allocator
|
|
264
283
|
* @return an instance of the sketch
|
|
284
|
+
*
|
|
285
|
+
* Deprecated, to be removed in the next major version
|
|
265
286
|
*/
|
|
266
287
|
static frequent_items_sketch deserialize(const void* bytes, size_t size, const A& allocator = A());
|
|
267
288
|
|
|
289
|
+
/**
|
|
290
|
+
* This method deserializes a sketch from a given array of bytes.
|
|
291
|
+
* @param bytes pointer to the array of bytes
|
|
292
|
+
* @param size the size of the array
|
|
293
|
+
* @param instance of a SerDe
|
|
294
|
+
* @param instance of an Allocator
|
|
295
|
+
* @return an instance of the sketch
|
|
296
|
+
*/
|
|
297
|
+
template<typename SerDe = S>
|
|
298
|
+
static frequent_items_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
|
|
299
|
+
|
|
268
300
|
/**
|
|
269
301
|
* Returns a human readable summary of this sketch
|
|
270
302
|
* @param print_items if true include the list of items retained by the sketch
|