datasketches 0.2.7 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/datasketches/kll_wrapper.cpp +20 -20
- data/ext/datasketches/theta_wrapper.cpp +2 -2
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
- data/vendor/datasketches-cpp/MANIFEST.in +21 -2
- data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
- data/vendor/datasketches-cpp/common/include/version.hpp.in +36 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +25 -6
- data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +16 -9
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +54 -41
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +26 -26
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +196 -232
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +41 -31
- data/vendor/datasketches-cpp/pyproject.toml +17 -12
- data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +18 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
- data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
- data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +19 -18
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +240 -232
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +26 -26
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +116 -103
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/setup.py +14 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +2 -1
- data/vendor/datasketches-cpp/tox.ini +26 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +16 -4
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +26 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -0
- metadata +14 -5
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: d8db863a37a8fa081bff6bf269666cdff6d4e8a4cf860a0fafac235858709f62
|
|
4
|
+
data.tar.gz: a858071aae33a8aeb5d92cec1f4fdf4cc3cb5d12b07ed629152c953674c06dff
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 03acf7acb3ecb617713d3549e289ed5829f55bd65a52c28eaf1a603cc2e9e577f7d7ffdebd230b01d259d1116ebbef06640a1a9b1d00baa8d0e970cd31357923
|
|
7
|
+
data.tar.gz: eb4730c27379a392b330cbdf9cf17d6d1207307e46ca2c0203b1e4eab237218115824249646f5916737af01b96b3451a373ce51c2020576af49a1da31a5070b5
|
data/CHANGELOG.md
CHANGED
|
@@ -26,43 +26,43 @@ namespace Rice::detail
|
|
|
26
26
|
template<typename T>
|
|
27
27
|
void bind_kll_sketch(Rice::Module& m, const char* name) {
|
|
28
28
|
Rice::define_class_under<kll_sketch<T>>(m, name)
|
|
29
|
-
.define_constructor(Rice::Constructor<kll_sketch<T>, uint16_t>(), Rice::Arg("k")=
|
|
29
|
+
.define_constructor(Rice::Constructor<kll_sketch<T>, uint16_t>(), Rice::Arg("k")=datasketches::kll_constants::DEFAULT_K)
|
|
30
30
|
.define_method("empty?", &kll_sketch<T>::is_empty)
|
|
31
31
|
.define_method("n", &kll_sketch<T>::get_n)
|
|
32
32
|
.define_method("num_retained", &kll_sketch<T>::get_num_retained)
|
|
33
33
|
.define_method("estimation_mode?", &kll_sketch<T>::is_estimation_mode)
|
|
34
|
-
.define_method("min_value", &kll_sketch<T>::
|
|
35
|
-
.define_method("max_value", &kll_sketch<T>::
|
|
34
|
+
.define_method("min_value", &kll_sketch<T>::get_min_item)
|
|
35
|
+
.define_method("max_value", &kll_sketch<T>::get_max_item)
|
|
36
36
|
.define_method(
|
|
37
37
|
"quantile",
|
|
38
|
-
[](kll_sketch<T>& self, double
|
|
39
|
-
return self.get_quantile(
|
|
40
|
-
})
|
|
38
|
+
[](kll_sketch<T>& self, double rank, bool inclusive) {
|
|
39
|
+
return self.get_quantile(rank, inclusive);
|
|
40
|
+
}, Rice::Arg("rank"), Rice::Arg("inclusive")=false)
|
|
41
41
|
.define_method(
|
|
42
42
|
"quantiles",
|
|
43
|
-
[](kll_sketch<T>& self, Rice::Object obj) {
|
|
43
|
+
[](kll_sketch<T>& self, Rice::Object obj, bool inclusive) {
|
|
44
44
|
if (obj.is_a(rb_cArray)) {
|
|
45
|
-
auto
|
|
46
|
-
return self.get_quantiles(&
|
|
45
|
+
auto ranks = Rice::detail::From_Ruby<std::vector<double>>().convert(obj);
|
|
46
|
+
return self.get_quantiles(&ranks[0], ranks.size(), inclusive);
|
|
47
47
|
} else {
|
|
48
|
-
return self.get_quantiles(Rice::detail::From_Ruby<size_t>().convert(obj));
|
|
48
|
+
return self.get_quantiles(Rice::detail::From_Ruby<size_t>().convert(obj), inclusive);
|
|
49
49
|
}
|
|
50
|
-
})
|
|
50
|
+
}, Rice::Arg("obj"), Rice::Arg("inclusive")=false)
|
|
51
51
|
.define_method(
|
|
52
52
|
"rank",
|
|
53
|
-
[](kll_sketch<T>& self, const T item) {
|
|
54
|
-
return self.get_rank(item);
|
|
55
|
-
})
|
|
53
|
+
[](kll_sketch<T>& self, const T item, bool inclusive) {
|
|
54
|
+
return self.get_rank(item, inclusive);
|
|
55
|
+
}, Rice::Arg("item"), Rice::Arg("inclusive")=false)
|
|
56
56
|
.define_method(
|
|
57
57
|
"pmf",
|
|
58
|
-
[](kll_sketch<T>& self, const std::vector<T>& split_points) {
|
|
59
|
-
return self.get_PMF(&split_points[0], split_points.size());
|
|
60
|
-
})
|
|
58
|
+
[](kll_sketch<T>& self, const std::vector<T>& split_points, bool inclusive) {
|
|
59
|
+
return self.get_PMF(&split_points[0], split_points.size(), inclusive);
|
|
60
|
+
}, Rice::Arg("split_points"), Rice::Arg("inclusive")=false)
|
|
61
61
|
.define_method(
|
|
62
62
|
"cdf",
|
|
63
|
-
[](kll_sketch<T>& self, const std::vector<T>& split_points) {
|
|
64
|
-
return self.get_CDF(&split_points[0], split_points.size());
|
|
65
|
-
})
|
|
63
|
+
[](kll_sketch<T>& self, const std::vector<T>& split_points, bool inclusive) {
|
|
64
|
+
return self.get_CDF(&split_points[0], split_points.size(), inclusive);
|
|
65
|
+
}, Rice::Arg("split_points"), Rice::Arg("inclusive")=false)
|
|
66
66
|
.define_method(
|
|
67
67
|
"merge",
|
|
68
68
|
[](kll_sketch<T>& self, const kll_sketch<T>& other) {
|
|
@@ -59,7 +59,7 @@ void init_theta(Rice::Module& m) {
|
|
|
59
59
|
builder.set_seed(seed);
|
|
60
60
|
return builder.build();
|
|
61
61
|
},
|
|
62
|
-
Arg("lg_k")=
|
|
62
|
+
Arg("lg_k")=datasketches::theta_constants::DEFAULT_LG_K, Arg("p")=1.0, Arg("seed")=DEFAULT_SEED)
|
|
63
63
|
.define_method("compact", &update_theta_sketch::compact, Arg("ordered")=true)
|
|
64
64
|
.define_method(
|
|
65
65
|
"update",
|
|
@@ -88,7 +88,7 @@ void init_theta(Rice::Module& m) {
|
|
|
88
88
|
builder.set_seed(seed);
|
|
89
89
|
return builder.build();
|
|
90
90
|
},
|
|
91
|
-
Arg("lg_k")=
|
|
91
|
+
Arg("lg_k")=datasketches::theta_constants::DEFAULT_LG_K, Arg("p")=1.0, Arg("seed")=DEFAULT_SEED)
|
|
92
92
|
.define_method("update", &theta_union::update<const theta_sketch&>)
|
|
93
93
|
.define_method("result", &theta_union::get_result, Arg("ordered")=true);
|
|
94
94
|
|
data/lib/datasketches/version.rb
CHANGED
|
@@ -16,10 +16,18 @@
|
|
|
16
16
|
# under the License.
|
|
17
17
|
|
|
18
18
|
cmake_minimum_required(VERSION 3.16.0)
|
|
19
|
+
|
|
20
|
+
string(TIMESTAMP DT %Y%m%d UTC)
|
|
21
|
+
string(TIMESTAMP HHMM %H%M UTC)
|
|
22
|
+
configure_file(version.cfg.in version.cfg @ONLY)
|
|
23
|
+
file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/version.cfg BASE_VERSION)
|
|
24
|
+
|
|
19
25
|
project(DataSketches
|
|
20
|
-
VERSION
|
|
26
|
+
VERSION ${BASE_VERSION}
|
|
21
27
|
LANGUAGES CXX)
|
|
22
28
|
|
|
29
|
+
message("Configuring DataSketches version ${BASE_VERSION}")
|
|
30
|
+
|
|
23
31
|
include(GNUInstallDirs)
|
|
24
32
|
include(CMakeDependentOption)
|
|
25
33
|
|
|
@@ -1,11 +1,27 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
1
18
|
global-include CMakeLists.txt
|
|
2
19
|
global-include *.cpp
|
|
3
20
|
global-include *.c
|
|
4
21
|
global-include *.hpp
|
|
5
22
|
global-include *.h
|
|
6
23
|
global-include *.bin
|
|
7
|
-
|
|
8
|
-
global-exclude .git*
|
|
24
|
+
global-include *.in
|
|
9
25
|
|
|
10
26
|
graft cmake
|
|
11
27
|
graft common
|
|
@@ -18,3 +34,6 @@ graft theta
|
|
|
18
34
|
graft tuple
|
|
19
35
|
graft sampling
|
|
20
36
|
graft python
|
|
37
|
+
|
|
38
|
+
# exclusions appear after including subdirectories
|
|
39
|
+
prune build
|
|
@@ -17,6 +17,8 @@
|
|
|
17
17
|
|
|
18
18
|
add_library(common INTERFACE)
|
|
19
19
|
|
|
20
|
+
configure_file(include/version.hpp.in include/version.hpp @ONLY)
|
|
21
|
+
|
|
20
22
|
if (BUILD_TESTS)
|
|
21
23
|
add_subdirectory(test)
|
|
22
24
|
endif()
|
|
@@ -32,6 +34,7 @@ target_compile_features(common INTERFACE cxx_std_11)
|
|
|
32
34
|
install(TARGETS common EXPORT ${PROJECT_NAME})
|
|
33
35
|
|
|
34
36
|
install(FILES
|
|
37
|
+
${CMAKE_CURRENT_BINARY_DIR}/include/version.hpp
|
|
35
38
|
include/common_defs.hpp
|
|
36
39
|
include/memory_operations.hpp
|
|
37
40
|
include/MurmurHash3.h
|
|
@@ -43,8 +46,8 @@ install(FILES
|
|
|
43
46
|
include/conditional_forward.hpp
|
|
44
47
|
include/ceiling_power_of_2.hpp
|
|
45
48
|
include/bounds_binomial_proportions.hpp
|
|
46
|
-
include/
|
|
47
|
-
include/
|
|
49
|
+
include/quantiles_sorted_view.hpp
|
|
50
|
+
include/quantiles_sorted_view_impl.hpp
|
|
48
51
|
include/kolmogorov_smirnov.hpp
|
|
49
52
|
include/kolmogorov_smirnov_impl.hpp
|
|
50
53
|
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
|
|
@@ -86,6 +86,16 @@ static inline void write(std::ostream& os, const T* ptr, size_t size_bytes) {
|
|
|
86
86
|
os.write(reinterpret_cast<const char*>(ptr), size_bytes);
|
|
87
87
|
}
|
|
88
88
|
|
|
89
|
+
// wrapper for iterators to implement operator-> returning temporary value
|
|
90
|
+
template<typename T>
|
|
91
|
+
class return_value_holder {
|
|
92
|
+
public:
|
|
93
|
+
return_value_holder(T value): value_(value) {}
|
|
94
|
+
const T* operator->() const { return std::addressof(value_); }
|
|
95
|
+
private:
|
|
96
|
+
T value_;
|
|
97
|
+
};
|
|
98
|
+
|
|
89
99
|
} // namespace
|
|
90
100
|
|
|
91
101
|
#endif // _COMMON_DEFS_HPP_
|
|
@@ -28,16 +28,16 @@ namespace datasketches {
|
|
|
28
28
|
template<typename Sketch>
|
|
29
29
|
double kolmogorov_smirnov::delta(const Sketch& sketch1, const Sketch& sketch2) {
|
|
30
30
|
auto comparator = sketch1.get_comparator(); // assuming the same comparator in sketch2
|
|
31
|
-
auto view1 = sketch1.get_sorted_view(
|
|
32
|
-
auto view2 = sketch2.get_sorted_view(
|
|
31
|
+
auto view1 = sketch1.get_sorted_view();
|
|
32
|
+
auto view2 = sketch2.get_sorted_view();
|
|
33
33
|
auto it1 = view1.begin();
|
|
34
34
|
auto it2 = view2.begin();
|
|
35
35
|
const auto n1 = sketch1.get_n();
|
|
36
36
|
const auto n2 = sketch2.get_n();
|
|
37
37
|
double delta = 0;
|
|
38
38
|
while (it1 != view1.end() && it2 != view2.end()) {
|
|
39
|
-
const double norm_cum_wt1 = static_cast<double>(
|
|
40
|
-
const double norm_cum_wt2 = static_cast<double>(
|
|
39
|
+
const double norm_cum_wt1 = static_cast<double>(it1.get_cumulative_weight(false)) / n1;
|
|
40
|
+
const double norm_cum_wt2 = static_cast<double>(it2.get_cumulative_weight(false)) / n2;
|
|
41
41
|
delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
|
|
42
42
|
if (comparator((*it1).first, (*it2).first)) {
|
|
43
43
|
++it1;
|
|
@@ -48,8 +48,8 @@ double kolmogorov_smirnov::delta(const Sketch& sketch1, const Sketch& sketch2) {
|
|
|
48
48
|
++it2;
|
|
49
49
|
}
|
|
50
50
|
}
|
|
51
|
-
const double norm_cum_wt1 = it1 == view1.end() ? 1 : static_cast<double>(
|
|
52
|
-
const double norm_cum_wt2 = it2 == view2.end() ? 1 : static_cast<double>(
|
|
51
|
+
const double norm_cum_wt1 = it1 == view1.end() ? 1 : static_cast<double>(it1.get_cumulative_weight(false)) / n1;
|
|
52
|
+
const double norm_cum_wt2 = it2 == view2.end() ? 1 : static_cast<double>(it2.get_cumulative_weight(false)) / n2;
|
|
53
53
|
delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
|
|
54
54
|
return delta;
|
|
55
55
|
}
|
|
@@ -17,10 +17,13 @@
|
|
|
17
17
|
* under the License.
|
|
18
18
|
*/
|
|
19
19
|
|
|
20
|
-
#ifndef
|
|
21
|
-
#define
|
|
20
|
+
#ifndef QUANTILES_SORTED_VIEW_HPP_
|
|
21
|
+
#define QUANTILES_SORTED_VIEW_HPP_
|
|
22
22
|
|
|
23
23
|
#include <functional>
|
|
24
|
+
#include <cmath>
|
|
25
|
+
|
|
26
|
+
#include "common_defs.hpp"
|
|
24
27
|
|
|
25
28
|
namespace datasketches {
|
|
26
29
|
|
|
@@ -29,18 +32,17 @@ template<
|
|
|
29
32
|
typename Comparator, // strict weak ordering function (see C++ named requirements: Compare)
|
|
30
33
|
typename Allocator
|
|
31
34
|
>
|
|
32
|
-
class
|
|
35
|
+
class quantiles_sorted_view {
|
|
33
36
|
public:
|
|
34
37
|
using Entry = typename std::conditional<std::is_arithmetic<T>::value, std::pair<T, uint64_t>, std::pair<const T*, uint64_t>>::type;
|
|
35
38
|
using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
|
|
36
39
|
using Container = std::vector<Entry, AllocEntry>;
|
|
37
40
|
|
|
38
|
-
|
|
41
|
+
quantiles_sorted_view(uint32_t num, const Comparator& comparator, const Allocator& allocator);
|
|
39
42
|
|
|
40
43
|
template<typename Iterator>
|
|
41
44
|
void add(Iterator begin, Iterator end, uint64_t weight);
|
|
42
45
|
|
|
43
|
-
template<bool inclusive>
|
|
44
46
|
void convert_to_cummulative();
|
|
45
47
|
|
|
46
48
|
class const_iterator;
|
|
@@ -49,18 +51,29 @@ public:
|
|
|
49
51
|
|
|
50
52
|
size_t size() const;
|
|
51
53
|
|
|
52
|
-
|
|
54
|
+
double get_rank(const T& item, bool inclusive = true) const;
|
|
55
|
+
|
|
53
56
|
using quantile_return_type = typename std::conditional<std::is_arithmetic<T>::value, T, const T&>::type;
|
|
54
|
-
quantile_return_type get_quantile(double rank) const;
|
|
57
|
+
quantile_return_type get_quantile(double rank, bool inclusive = true) const;
|
|
58
|
+
|
|
59
|
+
using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
|
|
60
|
+
vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;
|
|
61
|
+
vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
|
|
55
62
|
|
|
56
63
|
private:
|
|
64
|
+
Comparator comparator_;
|
|
65
|
+
uint64_t total_weight_;
|
|
66
|
+
Container entries_;
|
|
67
|
+
|
|
57
68
|
static inline const T& deref_helper(const T* t) { return *t; }
|
|
58
69
|
static inline T deref_helper(T t) { return t; }
|
|
59
70
|
|
|
60
71
|
struct compare_pairs_by_first {
|
|
72
|
+
explicit compare_pairs_by_first(const Comparator& comparator): comparator_(comparator) {}
|
|
61
73
|
bool operator()(const Entry& a, const Entry& b) const {
|
|
62
|
-
return
|
|
74
|
+
return comparator_(deref_helper(a.first), deref_helper(b.first));
|
|
63
75
|
}
|
|
76
|
+
Comparator comparator_;
|
|
64
77
|
};
|
|
65
78
|
|
|
66
79
|
struct compare_pairs_by_second {
|
|
@@ -81,41 +94,63 @@ private:
|
|
|
81
94
|
template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
|
|
82
95
|
static inline Entry make_dummy_entry(uint64_t weight) { return Entry(nullptr, weight); }
|
|
83
96
|
|
|
84
|
-
|
|
85
|
-
|
|
97
|
+
template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
|
|
98
|
+
static inline void check_split_points(const T* items, uint32_t size) {
|
|
99
|
+
for (uint32_t i = 0; i < size ; i++) {
|
|
100
|
+
if (std::isnan(items[i])) {
|
|
101
|
+
throw std::invalid_argument("Values must not be NaN");
|
|
102
|
+
}
|
|
103
|
+
if ((i < (size - 1)) && !(Comparator()(items[i], items[i + 1]))) {
|
|
104
|
+
throw std::invalid_argument("Values must be unique and monotonically increasing");
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
|
|
110
|
+
static inline void check_split_points(const T* items, uint32_t size) {
|
|
111
|
+
for (uint32_t i = 0; i < size ; i++) {
|
|
112
|
+
if ((i < (size - 1)) && !(Comparator()(items[i], items[i + 1]))) {
|
|
113
|
+
throw std::invalid_argument("Items must be unique and monotonically increasing");
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
}
|
|
86
117
|
};
|
|
87
118
|
|
|
88
119
|
template<typename T, typename C, typename A>
|
|
89
|
-
class
|
|
120
|
+
class quantiles_sorted_view<T, C, A>::const_iterator: public quantiles_sorted_view<T, C, A>::Container::const_iterator {
|
|
90
121
|
public:
|
|
91
|
-
using Base = typename
|
|
122
|
+
using Base = typename quantiles_sorted_view<T, C, A>::Container::const_iterator;
|
|
92
123
|
using value_type = typename std::conditional<std::is_arithmetic<T>::value, typename Base::value_type, std::pair<const T&, const uint64_t>>::type;
|
|
93
124
|
|
|
94
|
-
const_iterator(const Base& it): Base(it) {}
|
|
125
|
+
const_iterator(const Base& it, const Base& begin): Base(it), begin(begin) {}
|
|
95
126
|
|
|
96
127
|
template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
|
|
97
|
-
value_type operator*() const { return Base::operator*(); }
|
|
128
|
+
const value_type operator*() const { return Base::operator*(); }
|
|
98
129
|
|
|
99
130
|
template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
|
|
100
|
-
value_type operator*() const { return value_type(*(Base::operator*().first), Base::operator*().second); }
|
|
101
|
-
|
|
102
|
-
class return_value_holder {
|
|
103
|
-
public:
|
|
104
|
-
return_value_holder(value_type value): value_(value) {}
|
|
105
|
-
const value_type* operator->() const { return &value_; }
|
|
106
|
-
private:
|
|
107
|
-
value_type value_;
|
|
108
|
-
};
|
|
131
|
+
const value_type operator*() const { return value_type(*(Base::operator*().first), Base::operator*().second); }
|
|
109
132
|
|
|
110
133
|
template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
|
|
111
134
|
const value_type* operator->() const { return Base::operator->(); }
|
|
112
135
|
|
|
113
136
|
template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
|
|
114
|
-
return_value_holder operator->() const { return **this; }
|
|
137
|
+
const return_value_holder<value_type> operator->() const { return **this; }
|
|
138
|
+
|
|
139
|
+
uint64_t get_weight() const {
|
|
140
|
+
if (*this == begin) return Base::operator*().second;
|
|
141
|
+
return Base::operator*().second - (*this - 1).operator*().second;
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
uint64_t get_cumulative_weight(bool inclusive = true) const {
|
|
145
|
+
return inclusive ? Base::operator*().second : Base::operator*().second - get_weight();
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
private:
|
|
149
|
+
Base begin;
|
|
115
150
|
};
|
|
116
151
|
|
|
117
152
|
} /* namespace datasketches */
|
|
118
153
|
|
|
119
|
-
#include "
|
|
154
|
+
#include "quantiles_sorted_view_impl.hpp"
|
|
120
155
|
|
|
121
156
|
#endif
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef QUANTILES_SORTED_VIEW_IMPL_HPP_
|
|
21
|
+
#define QUANTILES_SORTED_VIEW_IMPL_HPP_
|
|
22
|
+
|
|
23
|
+
#include <algorithm>
|
|
24
|
+
#include <stdexcept>
|
|
25
|
+
#include <cmath>
|
|
26
|
+
|
|
27
|
+
namespace datasketches {
|
|
28
|
+
|
|
29
|
+
template<typename T, typename C, typename A>
|
|
30
|
+
quantiles_sorted_view<T, C, A>::quantiles_sorted_view(uint32_t num, const C& comparator, const A& allocator):
|
|
31
|
+
comparator_(comparator),
|
|
32
|
+
total_weight_(0),
|
|
33
|
+
entries_(allocator)
|
|
34
|
+
{
|
|
35
|
+
entries_.reserve(num);
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
template<typename T, typename C, typename A>
|
|
39
|
+
template<typename Iterator>
|
|
40
|
+
void quantiles_sorted_view<T, C, A>::add(Iterator first, Iterator last, uint64_t weight) {
|
|
41
|
+
const size_t size_before = entries_.size();
|
|
42
|
+
for (auto it = first; it != last; ++it) entries_.push_back(Entry(ref_helper(*it), weight));
|
|
43
|
+
if (size_before > 0) {
|
|
44
|
+
Container tmp(entries_.get_allocator());
|
|
45
|
+
tmp.reserve(entries_.capacity());
|
|
46
|
+
std::merge(
|
|
47
|
+
entries_.begin(), entries_.begin() + size_before,
|
|
48
|
+
entries_.begin() + size_before, entries_.end(),
|
|
49
|
+
std::back_inserter(tmp), compare_pairs_by_first(comparator_)
|
|
50
|
+
);
|
|
51
|
+
std::swap(tmp, entries_);
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
template<typename T, typename C, typename A>
|
|
56
|
+
void quantiles_sorted_view<T, C, A>::convert_to_cummulative() {
|
|
57
|
+
for (auto& entry: entries_) {
|
|
58
|
+
total_weight_ += entry.second;
|
|
59
|
+
entry.second = total_weight_;
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
template<typename T, typename C, typename A>
|
|
64
|
+
double quantiles_sorted_view<T, C, A>::get_rank(const T& item, bool inclusive) const {
|
|
65
|
+
if (entries_.empty()) throw std::runtime_error("operation is undefined for an empty sketch");
|
|
66
|
+
auto it = inclusive ?
|
|
67
|
+
std::upper_bound(entries_.begin(), entries_.end(), Entry(ref_helper(item), 0), compare_pairs_by_first(comparator_))
|
|
68
|
+
: std::lower_bound(entries_.begin(), entries_.end(), Entry(ref_helper(item), 0), compare_pairs_by_first(comparator_));
|
|
69
|
+
// we need item just before
|
|
70
|
+
if (it == entries_.begin()) return 0;
|
|
71
|
+
--it;
|
|
72
|
+
return static_cast<double>(it->second) / total_weight_;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
template<typename T, typename C, typename A>
|
|
76
|
+
auto quantiles_sorted_view<T, C, A>::get_quantile(double rank, bool inclusive) const -> quantile_return_type {
|
|
77
|
+
if (entries_.empty()) throw std::runtime_error("operation is undefined for an empty sketch");
|
|
78
|
+
uint64_t weight = inclusive ? std::ceil(rank * total_weight_) : rank * total_weight_;
|
|
79
|
+
auto it = inclusive ?
|
|
80
|
+
std::lower_bound(entries_.begin(), entries_.end(), make_dummy_entry<T>(weight), compare_pairs_by_second())
|
|
81
|
+
: std::upper_bound(entries_.begin(), entries_.end(), make_dummy_entry<T>(weight), compare_pairs_by_second());
|
|
82
|
+
if (it == entries_.end()) return deref_helper(entries_[entries_.size() - 1].first);
|
|
83
|
+
return deref_helper(it->first);
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
template<typename T, typename C, typename A>
|
|
87
|
+
auto quantiles_sorted_view<T, C, A>::get_CDF(const T* split_points, uint32_t size, bool inclusive) const -> vector_double {
|
|
88
|
+
if (entries_.empty()) throw std::runtime_error("operation is undefined for an empty sketch");
|
|
89
|
+
vector_double buckets(entries_.get_allocator());
|
|
90
|
+
if (entries_.size() == 0) return buckets;
|
|
91
|
+
check_split_points(split_points, size);
|
|
92
|
+
buckets.reserve(size + 1);
|
|
93
|
+
for (uint32_t i = 0; i < size; ++i) buckets.push_back(get_rank(split_points[i], inclusive));
|
|
94
|
+
buckets.push_back(1);
|
|
95
|
+
return buckets;
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
template<typename T, typename C, typename A>
|
|
99
|
+
auto quantiles_sorted_view<T, C, A>::get_PMF(const T* split_points, uint32_t size, bool inclusive) const -> vector_double {
|
|
100
|
+
auto buckets = get_CDF(split_points, size, inclusive);
|
|
101
|
+
if (buckets.size() == 0) return buckets;
|
|
102
|
+
for (uint32_t i = size; i > 0; --i) {
|
|
103
|
+
buckets[i] -= buckets[i - 1];
|
|
104
|
+
}
|
|
105
|
+
return buckets;
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
template<typename T, typename C, typename A>
|
|
109
|
+
auto quantiles_sorted_view<T, C, A>::begin() const -> const_iterator {
|
|
110
|
+
return const_iterator(entries_.begin(), entries_.begin());
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
template<typename T, typename C, typename A>
|
|
114
|
+
auto quantiles_sorted_view<T, C, A>::end() const -> const_iterator {
|
|
115
|
+
return const_iterator(entries_.end(), entries_.begin());
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
template<typename T, typename C, typename A>
|
|
119
|
+
size_t quantiles_sorted_view<T, C, A>::size() const {
|
|
120
|
+
return entries_.size();
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
} /* namespace datasketches */
|
|
124
|
+
|
|
125
|
+
#endif
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef _VERSION_HPP_
|
|
21
|
+
#define _VERSION_HPP_
|
|
22
|
+
|
|
23
|
+
namespace datasketches {
|
|
24
|
+
|
|
25
|
+
// the configured options and settings for DataSketches
|
|
26
|
+
constexpr int VERSION_MAJOR {@DataSketches_VERSION_MAJOR@};
|
|
27
|
+
constexpr int VERSION_MINOR {@DataSketches_VERSION_MINOR@};
|
|
28
|
+
constexpr int VERSION_PATCH {@DataSketches_VERSION_PATCH@};
|
|
29
|
+
constexpr int VERSION_TWEAK {@DataSketches_VERSION_TWEAK@};
|
|
30
|
+
|
|
31
|
+
constexpr auto VERSION_STR = "@DataSketches_VERSION@";
|
|
32
|
+
constexpr auto SOURCE_URL = "https://github.com/apache/datasketches-cpp";
|
|
33
|
+
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
#endif // _VERSION_HPP_
|
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
# and an integration test using the other parts of the library.
|
|
20
20
|
|
|
21
21
|
# common dependencies for tests
|
|
22
|
-
add_library(
|
|
22
|
+
add_library(common_test_lib OBJECT "")
|
|
23
23
|
|
|
24
24
|
include(FetchContent)
|
|
25
25
|
|
|
@@ -31,19 +31,19 @@ FetchContent_Declare(
|
|
|
31
31
|
|
|
32
32
|
FetchContent_MakeAvailable(Catch2)
|
|
33
33
|
|
|
34
|
-
target_link_libraries(
|
|
34
|
+
target_link_libraries(common_test_lib PUBLIC Catch2::Catch2)
|
|
35
35
|
|
|
36
|
-
set_target_properties(
|
|
36
|
+
set_target_properties(common_test_lib PROPERTIES
|
|
37
37
|
CXX_STANDARD 11
|
|
38
38
|
CXX_STANDARD_REQUIRED YES
|
|
39
39
|
)
|
|
40
40
|
|
|
41
|
-
target_include_directories(
|
|
41
|
+
target_include_directories(common_test_lib
|
|
42
42
|
INTERFACE
|
|
43
43
|
${CMAKE_CURRENT_SOURCE_DIR}
|
|
44
44
|
)
|
|
45
45
|
|
|
46
|
-
target_sources(
|
|
46
|
+
target_sources(common_test_lib
|
|
47
47
|
INTERFACE
|
|
48
48
|
${CMAKE_CURRENT_SOURCE_DIR}/test_allocator.hpp
|
|
49
49
|
${CMAKE_CURRENT_SOURCE_DIR}/test_type.hpp
|
|
@@ -52,10 +52,29 @@ target_sources(common_test
|
|
|
52
52
|
${CMAKE_CURRENT_SOURCE_DIR}/test_allocator.cpp
|
|
53
53
|
)
|
|
54
54
|
|
|
55
|
+
add_executable(common_test)
|
|
56
|
+
|
|
57
|
+
target_link_libraries(common_test common common_test_lib)
|
|
58
|
+
|
|
59
|
+
set_target_properties(common_test PROPERTIES
|
|
60
|
+
CXX_STANDARD 11
|
|
61
|
+
CXX_STANDARD_REQUIRED YES
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
add_test(
|
|
65
|
+
NAME common_test
|
|
66
|
+
COMMAND common_test
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
target_sources(common_test
|
|
70
|
+
PRIVATE
|
|
71
|
+
quantiles_sorted_view_test.cpp
|
|
72
|
+
)
|
|
73
|
+
|
|
55
74
|
# now the integration test part
|
|
56
75
|
add_executable(integration_test)
|
|
57
76
|
|
|
58
|
-
target_link_libraries(integration_test cpc fi hll kll req sampling theta tuple
|
|
77
|
+
target_link_libraries(integration_test cpc fi hll kll req sampling theta tuple common_test_lib)
|
|
59
78
|
|
|
60
79
|
set_target_properties(integration_test PROPERTIES
|
|
61
80
|
CXX_STANDARD 11
|