datasketches 0.2.5 → 0.2.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/LICENSE +4 -6
- data/NOTICE +6 -5
- data/ext/datasketches/kll_wrapper.cpp +2 -2
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/LICENSE +4 -6
- data/vendor/datasketches-cpp/MANIFEST.in +0 -2
- data/vendor/datasketches-cpp/NOTICE +6 -5
- data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -2
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +13 -2
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +22 -1
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +2 -1
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +1 -1
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +1 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +22 -9
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +47 -9
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +1 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +72 -7
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +1 -1
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
- data/vendor/datasketches-cpp/pyproject.toml +0 -1
- data/vendor/datasketches-cpp/python/README.md +6 -9
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +1 -1
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +15 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +78 -14
- data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +3 -3
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +1 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +64 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +6 -0
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +27 -0
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +11 -0
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +29 -2
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +1 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +70 -4
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/setup.py +2 -3
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +25 -31
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -1
- metadata +2 -4
- data/vendor/datasketches-cpp/common/test/catch.hpp +0 -17618
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +0 -29
@@ -26,6 +26,7 @@
|
|
26
26
|
#include <stdexcept>
|
27
27
|
|
28
28
|
#include "conditional_forward.hpp"
|
29
|
+
#include "count_zeros.hpp"
|
29
30
|
#include "memory_operations.hpp"
|
30
31
|
#include "kll_helper.hpp"
|
31
32
|
|
@@ -69,7 +70,7 @@ max_value_(nullptr),
|
|
69
70
|
is_level_zero_sorted_(other.is_level_zero_sorted_)
|
70
71
|
{
|
71
72
|
items_ = allocator_.allocate(items_size_);
|
72
|
-
|
73
|
+
for (auto i = levels_[0]; i < levels_[num_levels_]; ++i) new (&items_[i]) T(other.items_[i]);
|
73
74
|
if (other.min_value_ != nullptr) min_value_ = new (allocator_.allocate(1)) T(*other.min_value_);
|
74
75
|
if (other.max_value_ != nullptr) max_value_ = new (allocator_.allocate(1)) T(*other.max_value_);
|
75
76
|
}
|
@@ -147,6 +148,33 @@ kll_sketch<T, C, S, A>::~kll_sketch() {
|
|
147
148
|
}
|
148
149
|
}
|
149
150
|
|
151
|
+
template<typename T, typename C, typename S, typename A>
|
152
|
+
template<typename TT, typename CC, typename SS, typename AA>
|
153
|
+
kll_sketch<T, C, S, A>::kll_sketch(const kll_sketch<TT, CC, SS, AA>& other, const A& allocator):
|
154
|
+
allocator_(allocator),
|
155
|
+
k_(other.k_),
|
156
|
+
m_(other.m_),
|
157
|
+
min_k_(other.min_k_),
|
158
|
+
n_(other.n_),
|
159
|
+
num_levels_(other.num_levels_),
|
160
|
+
levels_(other.levels_, allocator_),
|
161
|
+
items_(nullptr),
|
162
|
+
items_size_(other.items_size_),
|
163
|
+
min_value_(nullptr),
|
164
|
+
max_value_(nullptr),
|
165
|
+
is_level_zero_sorted_(other.is_level_zero_sorted_)
|
166
|
+
{
|
167
|
+
static_assert(
|
168
|
+
std::is_constructible<T, TT>::value,
|
169
|
+
"Type converting constructor requires new type to be constructible from existing type"
|
170
|
+
);
|
171
|
+
items_ = allocator_.allocate(items_size_);
|
172
|
+
for (auto i = levels_[0]; i < levels_[num_levels_]; ++i) new (&items_[i]) T(other.items_[i]);
|
173
|
+
if (other.min_value_ != nullptr) min_value_ = new (allocator_.allocate(1)) T(*other.min_value_);
|
174
|
+
if (other.max_value_ != nullptr) max_value_ = new (allocator_.allocate(1)) T(*other.max_value_);
|
175
|
+
check_sorting();
|
176
|
+
}
|
177
|
+
|
150
178
|
template<typename T, typename C, typename S, typename A>
|
151
179
|
template<typename FwdT>
|
152
180
|
void kll_sketch<T, C, S, A>::update(FwdT&& value) {
|
@@ -305,8 +333,8 @@ double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
|
|
305
333
|
uint64_t weight = 1;
|
306
334
|
uint64_t total = 0;
|
307
335
|
while (level < num_levels_) {
|
308
|
-
const auto from_index
|
309
|
-
const auto to_index
|
336
|
+
const auto from_index = levels_[level];
|
337
|
+
const auto to_index = levels_[level + 1]; // exclusive
|
310
338
|
for (uint32_t i = from_index; i < to_index; i++) {
|
311
339
|
if (inclusive ? !C()(value, items_[i]) : C()(items_[i], value)) {
|
312
340
|
total += weight;
|
@@ -694,7 +722,7 @@ void kll_sketch<T, C, S, A>::compress_while_updating(void) {
|
|
694
722
|
// level zero might not be sorted, so we must sort it if we wish to compact it
|
695
723
|
// sort_level_zero() is not used here because of the adjustment for odd number of items
|
696
724
|
if ((level == 0) && !is_level_zero_sorted_) {
|
697
|
-
std::sort(
|
725
|
+
std::sort(items_ + adj_beg, items_ + adj_beg + adj_pop, C());
|
698
726
|
}
|
699
727
|
if (pop_above == 0) {
|
700
728
|
kll_helper::randomly_halve_up(items_, adj_beg, adj_pop);
|
@@ -717,7 +745,7 @@ void kll_sketch<T, C, S, A>::compress_while_updating(void) {
|
|
717
745
|
// so that the freed-up space can be used by level zero
|
718
746
|
if (level > 0) {
|
719
747
|
const uint32_t amount = raw_beg - levels_[0];
|
720
|
-
std::move_backward(
|
748
|
+
std::move_backward(items_ + levels_[0], items_ + levels_[0] + amount, items_ + levels_[0] + half_adj_pop + amount);
|
721
749
|
for (uint8_t lvl = 0; lvl < level; lvl++) levels_[lvl] += half_adj_pop;
|
722
750
|
}
|
723
751
|
for (uint32_t i = 0; i < half_adj_pop; i++) items_[i + destroy_beg].~T();
|
@@ -775,22 +803,32 @@ void kll_sketch<T, C, S, A>::add_empty_top_level_to_completely_full_sketch() {
|
|
775
803
|
template<typename T, typename C, typename S, typename A>
|
776
804
|
void kll_sketch<T, C, S, A>::sort_level_zero() {
|
777
805
|
if (!is_level_zero_sorted_) {
|
778
|
-
std::sort(
|
806
|
+
std::sort(items_ + levels_[0], items_ + levels_[1], C());
|
779
807
|
is_level_zero_sorted_ = true;
|
780
808
|
}
|
781
809
|
}
|
782
810
|
|
811
|
+
template<typename T, typename C, typename S, typename A>
|
812
|
+
void kll_sketch<T, C, S, A>::check_sorting() const {
|
813
|
+
// not checking level 0
|
814
|
+
for (uint8_t level = 1; level < num_levels_; ++level) {
|
815
|
+
const auto from = items_ + levels_[level];
|
816
|
+
const auto to = items_ + levels_[level + 1];
|
817
|
+
if (!std::is_sorted(from, to, C())) {
|
818
|
+
throw std::logic_error("levels must be sorted");
|
819
|
+
}
|
820
|
+
}
|
821
|
+
}
|
822
|
+
|
783
823
|
template<typename T, typename C, typename S, typename A>
|
784
824
|
template<bool inclusive>
|
785
825
|
quantile_sketch_sorted_view<T, C, A> kll_sketch<T, C, S, A>::get_sorted_view(bool cumulative) const {
|
786
826
|
const_cast<kll_sketch*>(this)->sort_level_zero(); // allow this side effect
|
787
827
|
quantile_sketch_sorted_view<T, C, A> view(get_num_retained(), allocator_);
|
788
|
-
uint8_t level = 0;
|
789
|
-
while (level < num_levels_) {
|
828
|
+
for (uint8_t level = 0; level < num_levels_; ++level) {
|
790
829
|
const auto from = items_ + levels_[level];
|
791
830
|
const auto to = items_ + levels_[level + 1]; // exclusive
|
792
831
|
view.add(from, to, 1 << level);
|
793
|
-
++level;
|
794
832
|
}
|
795
833
|
if (cumulative) view.template convert_to_cummulative<inclusive>();
|
796
834
|
return view;
|
@@ -17,7 +17,7 @@
|
|
17
17
|
* under the License.
|
18
18
|
*/
|
19
19
|
|
20
|
-
#include <catch.hpp>
|
20
|
+
#include <catch2/catch.hpp>
|
21
21
|
#include <cmath>
|
22
22
|
#include <cstring>
|
23
23
|
#include <sstream>
|
@@ -39,9 +39,9 @@ static std::string testBinaryInputPath = "test/";
|
|
39
39
|
#endif
|
40
40
|
|
41
41
|
// typical usage would be just kll_sketch<float> or kll_sketch<std::string>, but here we use test_allocator
|
42
|
-
|
42
|
+
using kll_float_sketch = kll_sketch<float, std::less<float>, serde<float>, test_allocator<float>>;
|
43
43
|
// let std::string use the default allocator for simplicity, otherwise we need to define "less" and "serde"
|
44
|
-
|
44
|
+
using kll_string_sketch = kll_sketch<std::string, std::less<std::string>, serde<std::string>, test_allocator<std::string>>;
|
45
45
|
|
46
46
|
TEST_CASE("kll sketch", "[kll_sketch]") {
|
47
47
|
|
@@ -75,7 +75,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
75
75
|
(void) it; // to suppress "unused" warning
|
76
76
|
FAIL("should be no iterations over an empty sketch");
|
77
77
|
}
|
78
|
-
}
|
78
|
+
}
|
79
79
|
|
80
80
|
SECTION("get bad quantile") {
|
81
81
|
kll_float_sketch sketch(200, 0);
|
@@ -835,10 +835,75 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
835
835
|
REQUIRE((*it).second == 3);
|
836
836
|
}
|
837
837
|
}
|
838
|
-
|
839
|
-
|
840
|
-
|
838
|
+
|
839
|
+
SECTION("type conversion: empty") {
|
840
|
+
kll_sketch<double> kll_double;
|
841
|
+
kll_sketch<float> kll_float(kll_double);
|
842
|
+
REQUIRE(kll_float.is_empty());
|
843
|
+
REQUIRE(kll_float.get_k() == kll_double.get_k());
|
844
|
+
REQUIRE(kll_float.get_n() == 0);
|
845
|
+
REQUIRE(kll_float.get_num_retained() == 0);
|
846
|
+
}
|
847
|
+
|
848
|
+
SECTION("type conversion: over k") {
|
849
|
+
kll_sketch<double> kll_double;
|
850
|
+
for (int i = 0; i < 1000; ++i) kll_double.update(static_cast<double>(i));
|
851
|
+
kll_sketch<float> kll_float(kll_double);
|
852
|
+
REQUIRE(!kll_float.is_empty());
|
853
|
+
REQUIRE(kll_float.get_k() == kll_double.get_k());
|
854
|
+
REQUIRE(kll_float.get_n() == kll_double.get_n());
|
855
|
+
REQUIRE(kll_float.get_num_retained() == kll_double.get_num_retained());
|
856
|
+
|
857
|
+
auto sv_float = kll_float.get_sorted_view(false);
|
858
|
+
auto sv_double = kll_double.get_sorted_view(false);
|
859
|
+
auto sv_float_it = sv_float.begin();
|
860
|
+
auto sv_double_it = sv_double.begin();
|
861
|
+
while (sv_float_it != sv_float.end()) {
|
862
|
+
REQUIRE(sv_double_it != sv_double.end());
|
863
|
+
auto float_pair = *sv_float_it;
|
864
|
+
auto double_pair = *sv_double_it;
|
865
|
+
REQUIRE(float_pair.first == Approx(double_pair.first).margin(0.01));
|
866
|
+
REQUIRE(float_pair.second == double_pair.second);
|
867
|
+
++sv_float_it;
|
868
|
+
++sv_double_it;
|
869
|
+
}
|
870
|
+
REQUIRE(sv_double_it == sv_double.end());
|
871
|
+
}
|
872
|
+
|
873
|
+
class A {
|
874
|
+
int val;
|
875
|
+
public:
|
876
|
+
A(int val): val(val) {}
|
877
|
+
int get_val() const { return val; }
|
878
|
+
};
|
879
|
+
|
880
|
+
struct less_A {
|
881
|
+
bool operator()(const A& a1, const A& a2) const { return a1.get_val() < a2.get_val(); }
|
882
|
+
};
|
883
|
+
|
884
|
+
class B {
|
885
|
+
int val;
|
886
|
+
public:
|
887
|
+
explicit B(const A& a): val(a.get_val()) {}
|
888
|
+
int get_val() const { return val; }
|
889
|
+
};
|
890
|
+
|
891
|
+
struct less_B {
|
892
|
+
bool operator()(const B& b1, const B& b2) const { return b1.get_val() < b2.get_val(); }
|
893
|
+
};
|
894
|
+
|
895
|
+
SECTION("type conversion: custom types") {
|
896
|
+
kll_sketch<A, less_A> sa;
|
897
|
+
sa.update(1);
|
898
|
+
sa.update(2);
|
899
|
+
sa.update(3);
|
900
|
+
|
901
|
+
kll_sketch<B, less_B> sb(sa);
|
902
|
+
REQUIRE(sb.get_n() == 3);
|
841
903
|
}
|
904
|
+
|
905
|
+
// cleanup
|
906
|
+
REQUIRE(test_allocator_total_bytes == 0);
|
842
907
|
}
|
843
908
|
|
844
909
|
} /* namespace datasketches */
|
@@ -12,16 +12,18 @@ This package provides a variety of sketches as described below. Wherever a speci
|
|
12
12
|
|
13
13
|
## Building and Installation
|
14
14
|
|
15
|
-
Once cloned, the library can be installed by running `
|
15
|
+
Once cloned, the library can be installed by running `python3 -m pip install .` in the project root directory -- not the python subdirectory -- which will also install the necessary dependencies, namely numpy and [pybind11[global]](https://github.com/pybind/pybind11).
|
16
16
|
|
17
|
-
If you prefer to call the `setup.py` build script directly, you must first install `pybind11[global]`, as well as any other dependencies listed under the build-system section in `pyproject.toml`.
|
17
|
+
If you prefer to call the `setup.py` build script directly, which is discouraged, you must first install `pybind11[global]`, as well as any other dependencies listed under the build-system section in `pyproject.toml`.
|
18
18
|
|
19
|
-
The library is also available from PyPI via `
|
19
|
+
The library is also available from PyPI via `python3 -m pip install datasketches`.
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
23
|
Having installed the library, loading the Apache Datasketches Library in Python is simple: `import datasketches`.
|
24
24
|
|
25
|
+
The unit tests are mostly structured in a tutorial style and can be used as a reference example for how to feed data into and query the different types of sketches.
|
26
|
+
|
25
27
|
## Available Sketch Classes
|
26
28
|
|
27
29
|
- KLL (Absolute Error Quantiles)
|
@@ -74,12 +76,7 @@ The only developer-specific instructions relate to running unit tests.
|
|
74
76
|
|
75
77
|
### Unit tests
|
76
78
|
|
77
|
-
The Python unit tests are run
|
78
|
-
|
79
|
-
```bash
|
80
|
-
python -m pip install --upgrade tox
|
81
|
-
tox
|
82
|
-
```
|
79
|
+
The Python unit tests are run via `tox`, with no arguments, from the project root directory -- not the python subdirectory. Tox creates a temporary virtual environment in which to build and run the unit tests. In the event you are missing the necessary package, tox may be installed with `python3 -m pip install --upgrade tox`.
|
83
80
|
|
84
81
|
## License
|
85
82
|
|
@@ -1,3 +1,3 @@
|
|
1
1
|
@echo off
|
2
2
|
:: Takes path to the Python interpreter and returns the path to pybind11
|
3
|
-
%1 -
|
3
|
+
%1 -c "import pybind11,sys;sys.stdout.write(pybind11.get_cmake_dir())"
|
@@ -151,6 +151,7 @@ template <typename T,
|
|
151
151
|
class quantiles_sketch {
|
152
152
|
public:
|
153
153
|
using value_type = T;
|
154
|
+
using allocator_type = Allocator;
|
154
155
|
using comparator = Comparator;
|
155
156
|
using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
|
156
157
|
|
@@ -161,6 +162,14 @@ public:
|
|
161
162
|
quantiles_sketch& operator=(const quantiles_sketch& other);
|
162
163
|
quantiles_sketch& operator=(quantiles_sketch&& other) noexcept;
|
163
164
|
|
165
|
+
/**
|
166
|
+
* @brief Type converting constructor
|
167
|
+
* @param other quantiles sketch of a different type
|
168
|
+
* @param allocator instance of an Allocator
|
169
|
+
*/
|
170
|
+
template<typename From, typename FC, typename FA>
|
171
|
+
explicit quantiles_sketch(const quantiles_sketch<From, FC, FA>& other, const Allocator& allocator = Allocator());
|
172
|
+
|
164
173
|
/**
|
165
174
|
* Updates this sketch with the given data item.
|
166
175
|
* @param value an item from a stream of items
|
@@ -227,6 +236,12 @@ public:
|
|
227
236
|
*/
|
228
237
|
Comparator get_comparator() const;
|
229
238
|
|
239
|
+
/**
|
240
|
+
* Returns the allocator for this sketch.
|
241
|
+
* @return allocator
|
242
|
+
*/
|
243
|
+
allocator_type get_allocator() const;
|
244
|
+
|
230
245
|
/**
|
231
246
|
* Returns an approximation to the value of the data item
|
232
247
|
* that would be preceded by the given fraction of a hypothetical sorted
|
@@ -138,6 +138,65 @@ is_sorted_(is_sorted)
|
|
138
138
|
throw std::logic_error("Item count does not match value computed from k, n");
|
139
139
|
}
|
140
140
|
|
141
|
+
template<typename T, typename C, typename A>
|
142
|
+
template<typename From, typename FC, typename FA>
|
143
|
+
quantiles_sketch<T, C, A>::quantiles_sketch(const quantiles_sketch<From, FC, FA>& other, const A& allocator) :
|
144
|
+
allocator_(allocator),
|
145
|
+
k_(other.get_k()),
|
146
|
+
n_(other.get_n()),
|
147
|
+
bit_pattern_(compute_bit_pattern(other.get_k(), other.get_n())),
|
148
|
+
base_buffer_(allocator),
|
149
|
+
levels_(allocator),
|
150
|
+
min_value_(nullptr),
|
151
|
+
max_value_(nullptr),
|
152
|
+
is_sorted_(false)
|
153
|
+
{
|
154
|
+
static_assert(std::is_constructible<T, From>::value,
|
155
|
+
"Type converting constructor requires new type to be constructible from existing type");
|
156
|
+
|
157
|
+
base_buffer_.reserve(2 * std::min(quantiles_constants::MIN_K, k_));
|
158
|
+
|
159
|
+
if (!other.is_empty()) {
|
160
|
+
min_value_ = new (allocator_.allocate(1)) T(other.get_min_value());
|
161
|
+
max_value_ = new (allocator_.allocate(1)) T(other.get_max_value());
|
162
|
+
|
163
|
+
// reserve space in levels
|
164
|
+
const uint8_t num_levels = compute_levels_needed(k_, n_);
|
165
|
+
levels_.reserve(num_levels);
|
166
|
+
for (int i = 0; i < num_levels; ++i) {
|
167
|
+
Level level(allocator);
|
168
|
+
level.reserve(k_);
|
169
|
+
levels_.push_back(std::move(level));
|
170
|
+
}
|
171
|
+
|
172
|
+
// iterate through points, assigning to the correct level as needed
|
173
|
+
for (auto pair : other) {
|
174
|
+
const uint64_t wt = pair.second;
|
175
|
+
if (wt == 1) {
|
176
|
+
base_buffer_.push_back(T(pair.first));
|
177
|
+
// resize where needed as if adding points via update()
|
178
|
+
if (base_buffer_.size() + 1 > base_buffer_.capacity()) {
|
179
|
+
const size_t new_size = std::max(std::min(static_cast<size_t>(2 * k_), 2 * base_buffer_.size()), static_cast<size_t>(1));
|
180
|
+
base_buffer_.reserve(new_size);
|
181
|
+
}
|
182
|
+
}
|
183
|
+
else {
|
184
|
+
const uint8_t idx = count_trailing_zeros_in_u64(pair.second) - 1;
|
185
|
+
levels_[idx].push_back(T(pair.first));
|
186
|
+
}
|
187
|
+
}
|
188
|
+
|
189
|
+
// validate that ordering within each level is preserved
|
190
|
+
// base_buffer_ can be considered unsorted for this purpose
|
191
|
+
for (int i = 0; i < num_levels; ++i) {
|
192
|
+
if (!std::is_sorted(levels_[i].begin(), levels_[i].end(), C())) {
|
193
|
+
throw std::logic_error("Copy construction across types produces invalid sorting");
|
194
|
+
}
|
195
|
+
}
|
196
|
+
}
|
197
|
+
}
|
198
|
+
|
199
|
+
|
141
200
|
template<typename T, typename C, typename A>
|
142
201
|
quantiles_sketch<T, C, A>::~quantiles_sketch() {
|
143
202
|
if (min_value_ != nullptr) {
|
@@ -238,7 +297,7 @@ void quantiles_sketch<T, C, A>::serialize(std::ostream& os, const SerDe& serde)
|
|
238
297
|
);
|
239
298
|
write(os, flags_byte);
|
240
299
|
write(os, k_);
|
241
|
-
uint16_t unused = 0;
|
300
|
+
const uint16_t unused = 0;
|
242
301
|
write(os, unused);
|
243
302
|
|
244
303
|
if (!is_empty()) {
|
@@ -624,6 +683,11 @@ C quantiles_sketch<T, C, A>::get_comparator() const {
|
|
624
683
|
return C();
|
625
684
|
}
|
626
685
|
|
686
|
+
template<typename T, typename C, typename A>
|
687
|
+
A quantiles_sketch<T, C, A>::get_allocator() const {
|
688
|
+
return allocator_;
|
689
|
+
}
|
690
|
+
|
627
691
|
// implementation for fixed-size arithmetic types (integral and floating point)
|
628
692
|
template<typename T, typename C, typename A>
|
629
693
|
template<typename SerDe, typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
|
@@ -783,9 +847,9 @@ auto quantiles_sketch<T, C, A>::get_CDF(const T* split_points, uint32_t size) co
|
|
783
847
|
|
784
848
|
template<typename T, typename C, typename A>
|
785
849
|
uint32_t quantiles_sketch<T, C, A>::compute_retained_items(const uint16_t k, const uint64_t n) {
|
786
|
-
uint32_t bb_count = compute_base_buffer_items(k, n);
|
787
|
-
uint64_t bit_pattern = compute_bit_pattern(k, n);
|
788
|
-
uint32_t valid_levels = compute_valid_levels(bit_pattern);
|
850
|
+
const uint32_t bb_count = compute_base_buffer_items(k, n);
|
851
|
+
const uint64_t bit_pattern = compute_bit_pattern(k, n);
|
852
|
+
const uint32_t valid_levels = compute_valid_levels(bit_pattern);
|
789
853
|
return bb_count + (k * valid_levels);
|
790
854
|
}
|
791
855
|
|
@@ -843,11 +907,11 @@ void quantiles_sketch<T, C, A>::check_family_id(uint8_t family_id) {
|
|
843
907
|
|
844
908
|
template<typename T, typename C, typename A>
|
845
909
|
void quantiles_sketch<T, C, A>::check_header_validity(uint8_t preamble_longs, uint8_t flags_byte, uint8_t serial_version) {
|
846
|
-
bool empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
|
847
|
-
bool compact = (flags_byte & (1 << flags::IS_COMPACT)) > 0;
|
910
|
+
const bool empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
|
911
|
+
const bool compact = (flags_byte & (1 << flags::IS_COMPACT)) > 0;
|
848
912
|
|
849
|
-
uint8_t sw = (compact ? 1 : 0) + (2 * (empty ? 1 : 0))
|
850
|
-
|
913
|
+
const uint8_t sw = (compact ? 1 : 0) + (2 * (empty ? 1 : 0))
|
914
|
+
+ (4 * (serial_version & 0xF)) + (32 * (preamble_longs & 0x3F));
|
851
915
|
bool valid = true;
|
852
916
|
|
853
917
|
switch (sw) { // exhaustive list and description of all valid cases
|
@@ -888,7 +952,7 @@ typename quantiles_sketch<T, C, A>::const_iterator quantiles_sketch<T, C, A>::en
|
|
888
952
|
|
889
953
|
template<typename T, typename C, typename A>
|
890
954
|
void quantiles_sketch<T, C, A>::grow_base_buffer() {
|
891
|
-
size_t new_size = std::max(std::min(static_cast<size_t>(2 * k_), 2 * base_buffer_.size()), static_cast<size_t>(1));
|
955
|
+
const size_t new_size = std::max(std::min(static_cast<size_t>(2 * k_), 2 * base_buffer_.size()), static_cast<size_t>(1));
|
892
956
|
base_buffer_.reserve(new_size);
|
893
957
|
}
|
894
958
|
|
@@ -912,7 +976,7 @@ void quantiles_sketch<T, C, A>::process_full_base_buffer() {
|
|
912
976
|
|
913
977
|
template<typename T, typename C, typename A>
|
914
978
|
bool quantiles_sketch<T, C, A>::grow_levels_if_needed() {
|
915
|
-
uint8_t levels_needed = compute_levels_needed(k_, n_);
|
979
|
+
const uint8_t levels_needed = compute_levels_needed(k_, n_);
|
916
980
|
if (levels_needed == 0)
|
917
981
|
return false; // don't need levels and might have small base buffer. Possible during merges.
|
918
982
|
|
@@ -992,7 +1056,7 @@ template<typename FwdV>
|
|
992
1056
|
void quantiles_sketch<T, C, A>::zip_buffer_with_stride(FwdV&& buf_in, Level& buf_out, uint16_t stride) {
|
993
1057
|
// Random offset in range [0, stride)
|
994
1058
|
std::uniform_int_distribution<uint16_t> dist(0, stride - 1);
|
995
|
-
uint16_t rand_offset = dist(random_utils::rand);
|
1059
|
+
const uint16_t rand_offset = dist(random_utils::rand);
|
996
1060
|
|
997
1061
|
if ((buf_in.size() != stride * buf_out.capacity())
|
998
1062
|
|| (buf_out.size() > 0)) {
|
@@ -1000,7 +1064,7 @@ void quantiles_sketch<T, C, A>::zip_buffer_with_stride(FwdV&& buf_in, Level& buf
|
|
1000
1064
|
"stride*buf_out.capacity() and empty buf_out");
|
1001
1065
|
}
|
1002
1066
|
|
1003
|
-
size_t k = buf_out.capacity();
|
1067
|
+
const size_t k = buf_out.capacity();
|
1004
1068
|
for (uint16_t i = rand_offset, o = 0; o < k; i += stride, ++o) {
|
1005
1069
|
buf_out.push_back(conditional_forward<FwdV>(buf_in[i]));
|
1006
1070
|
}
|
@@ -1117,7 +1181,7 @@ void quantiles_sketch<T, C, A>::downsampling_merge(quantiles_sketch& tgt, FwdSk&
|
|
1117
1181
|
const uint16_t downsample_factor = src.get_k() / tgt.get_k();
|
1118
1182
|
const uint8_t lg_sample_factor = count_trailing_zeros_in_u32(downsample_factor);
|
1119
1183
|
|
1120
|
-
uint64_t new_n = src.get_n() + tgt.get_n();
|
1184
|
+
const uint64_t new_n = src.get_n() + tgt.get_n();
|
1121
1185
|
|
1122
1186
|
// move items from src's base buffer
|
1123
1187
|
for (uint16_t i = 0; i < src.base_buffer_.size(); ++i) {
|
@@ -1125,7 +1189,7 @@ void quantiles_sketch<T, C, A>::downsampling_merge(quantiles_sketch& tgt, FwdSk&
|
|
1125
1189
|
}
|
1126
1190
|
|
1127
1191
|
// check (after moving raw items) if we need to extend levels array
|
1128
|
-
uint8_t levels_needed = compute_levels_needed(tgt.get_k(), new_n);
|
1192
|
+
const uint8_t levels_needed = compute_levels_needed(tgt.get_k(), new_n);
|
1129
1193
|
if (levels_needed > tgt.levels_.size()) {
|
1130
1194
|
tgt.levels_.reserve(levels_needed);
|
1131
1195
|
while (tgt.levels_.size() < levels_needed) {
|
@@ -17,7 +17,7 @@
|
|
17
17
|
* under the License.
|
18
18
|
*/
|
19
19
|
|
20
|
-
#include <catch.hpp>
|
20
|
+
#include <catch2/catch.hpp>
|
21
21
|
|
22
22
|
#include <random>
|
23
23
|
|
@@ -82,7 +82,7 @@ TEST_CASE("kolmogorov-smirnov slightly different distributions", "[quantiles_ske
|
|
82
82
|
const double delta = kolmogorov_smirnov::delta(sketch1, sketch2);
|
83
83
|
REQUIRE(delta == Approx(0.02).margin(0.01));
|
84
84
|
const double threshold = kolmogorov_smirnov::threshold(sketch1, sketch2, 0.05);
|
85
|
-
|
85
|
+
|
86
86
|
REQUIRE_FALSE(delta > threshold);
|
87
87
|
REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.05));
|
88
88
|
}
|
@@ -102,7 +102,7 @@ TEST_CASE("kolmogorov-smirnov slightly different distributions high resolution",
|
|
102
102
|
const double delta = kolmogorov_smirnov::delta(sketch1, sketch2);
|
103
103
|
REQUIRE(delta == Approx(0.02).margin(0.01));
|
104
104
|
const double threshold = kolmogorov_smirnov::threshold(sketch1, sketch2, 0.05);
|
105
|
-
|
105
|
+
|
106
106
|
REQUIRE(delta > threshold);
|
107
107
|
REQUIRE(kolmogorov_smirnov::test(sketch1, sketch2, 0.05));
|
108
108
|
}
|
@@ -17,7 +17,7 @@
|
|
17
17
|
* under the License.
|
18
18
|
*/
|
19
19
|
|
20
|
-
#include <catch.hpp>
|
20
|
+
#include <catch2/catch.hpp>
|
21
21
|
#include <cmath>
|
22
22
|
#include <sstream>
|
23
23
|
#include <fstream>
|
@@ -903,6 +903,69 @@ TEST_CASE("quantiles sketch", "[quantiles_sketch]") {
|
|
903
903
|
}
|
904
904
|
}
|
905
905
|
|
906
|
+
SECTION("Type converting copy constructor") {
|
907
|
+
const uint16_t k = 8;
|
908
|
+
const int n = 403;
|
909
|
+
quantiles_sketch<double> sk_double(k);
|
910
|
+
|
911
|
+
quantiles_sketch<float> sk_float(k, sk_double.get_allocator());
|
912
|
+
REQUIRE(sk_float.is_empty());
|
913
|
+
|
914
|
+
for (int i = 0; i < n; ++i) sk_double.update(i + .01);
|
915
|
+
|
916
|
+
quantiles_sketch<int> sk_int(sk_double);
|
917
|
+
REQUIRE(sk_double.get_n() == sk_int.get_n());
|
918
|
+
REQUIRE(sk_double.get_k() == sk_int.get_k());
|
919
|
+
REQUIRE(sk_double.get_num_retained() == sk_int.get_num_retained());
|
920
|
+
|
921
|
+
auto sv_double = sk_double.get_sorted_view(false);
|
922
|
+
std::vector<std::pair<double, uint64_t>> vec_double(sv_double.begin(), sv_double.end());
|
923
|
+
|
924
|
+
auto sv_int = sk_int.get_sorted_view(false);
|
925
|
+
std::vector<std::pair<int, uint64_t>> vec_int(sv_int.begin(), sv_int.end());
|
926
|
+
|
927
|
+
REQUIRE(vec_double.size() == vec_int.size());
|
928
|
+
|
929
|
+
for (size_t i = 0; i < vec_int.size(); ++i) {
|
930
|
+
// known truncation with conversion so approximate result
|
931
|
+
REQUIRE(vec_double[i].first == Approx(vec_int[i].first).margin(0.1));
|
932
|
+
// exact equality for weights
|
933
|
+
REQUIRE(vec_double[i].second == vec_int[i].second);
|
934
|
+
}
|
935
|
+
}
|
936
|
+
|
937
|
+
class A {
|
938
|
+
int val;
|
939
|
+
public:
|
940
|
+
A(int val): val(val) {}
|
941
|
+
int get_val() const { return val; }
|
942
|
+
};
|
943
|
+
|
944
|
+
struct less_A {
|
945
|
+
bool operator()(const A& a1, const A& a2) const { return a1.get_val() < a2.get_val(); }
|
946
|
+
};
|
947
|
+
|
948
|
+
class B {
|
949
|
+
int val;
|
950
|
+
public:
|
951
|
+
explicit B(const A& a): val(a.get_val()) {}
|
952
|
+
int get_val() const { return val; }
|
953
|
+
};
|
954
|
+
|
955
|
+
struct less_B {
|
956
|
+
bool operator()(const B& b1, const B& b2) const { return b1.get_val() < b2.get_val(); }
|
957
|
+
};
|
958
|
+
|
959
|
+
SECTION("type conversion: custom types") {
|
960
|
+
quantiles_sketch<A, less_A> sa;
|
961
|
+
sa.update(1);
|
962
|
+
sa.update(2);
|
963
|
+
sa.update(3);
|
964
|
+
|
965
|
+
quantiles_sketch<B, less_B> sb(sa);
|
966
|
+
REQUIRE(sb.get_n() == 3);
|
967
|
+
}
|
968
|
+
|
906
969
|
// cleanup
|
907
970
|
if (test_allocator_total_bytes != 0) {
|
908
971
|
REQUIRE(test_allocator_total_bytes == 0);
|
@@ -38,6 +38,9 @@ public:
|
|
38
38
|
req_compactor& operator=(const req_compactor& other);
|
39
39
|
req_compactor& operator=(req_compactor&& other);
|
40
40
|
|
41
|
+
template<typename TT, typename CC, typename AA>
|
42
|
+
req_compactor(const req_compactor<TT, CC, AA>& other, const Allocator& allocator);
|
43
|
+
|
41
44
|
bool is_sorted() const;
|
42
45
|
uint32_t get_num_items() const;
|
43
46
|
uint32_t get_nom_capacity() const;
|
@@ -128,6 +131,9 @@ private:
|
|
128
131
|
template<typename S>
|
129
132
|
static std::pair<std::unique_ptr<T, items_deleter>, size_t> deserialize_items(const void* bytes, size_t size, const S& serde, const Allocator& allocator, uint32_t num);
|
130
133
|
|
134
|
+
// for type converting constructor
|
135
|
+
template<typename TT, typename CC, typename AA>
|
136
|
+
friend class req_compactor;
|
131
137
|
};
|
132
138
|
|
133
139
|
} /* namespace datasketches */
|