datasketches 0.2.0 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE +40 -3
- data/NOTICE +1 -1
- data/README.md +7 -7
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
- data/vendor/datasketches-cpp/LICENSE +40 -3
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/pyproject.toml +4 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
- data/vendor/datasketches-cpp/python/README.md +50 -50
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +10 -7
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
- metadata +18 -7
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
|
@@ -27,7 +27,7 @@ namespace datasketches {
|
|
|
27
27
|
using hll_sketch_test_alloc = hll_sketch_alloc<test_allocator<uint8_t>>;
|
|
28
28
|
using alloc = test_allocator<uint8_t>;
|
|
29
29
|
|
|
30
|
-
static void runCheckCopy(
|
|
30
|
+
static void runCheckCopy(uint8_t lgConfigK, target_hll_type tgtHllType) {
|
|
31
31
|
hll_sketch_test_alloc sk(lgConfigK, tgtHllType, false, 0);
|
|
32
32
|
|
|
33
33
|
for (int i = 0; i < 7; ++i) {
|
|
@@ -66,7 +66,7 @@ TEST_CASE("hll sketch: check copies", "[hll_sketch]") {
|
|
|
66
66
|
}
|
|
67
67
|
|
|
68
68
|
static void copyAs(target_hll_type srcType, target_hll_type dstType) {
|
|
69
|
-
|
|
69
|
+
uint8_t lgK = 8;
|
|
70
70
|
int n1 = 7;
|
|
71
71
|
int n2 = 24;
|
|
72
72
|
int n3 = 1000;
|
|
@@ -109,7 +109,7 @@ TEST_CASE("hll sketch: check copy as", "[hll_sketch]") {
|
|
|
109
109
|
TEST_CASE("hll sketch: check misc1", "[hll_sketch]") {
|
|
110
110
|
test_allocator_total_bytes = 0;
|
|
111
111
|
{
|
|
112
|
-
|
|
112
|
+
uint8_t lgConfigK = 8;
|
|
113
113
|
target_hll_type srcType = target_hll_type::HLL_8;
|
|
114
114
|
hll_sketch_test_alloc sk(lgConfigK, srcType, false, 0);
|
|
115
115
|
|
|
@@ -124,7 +124,7 @@ TEST_CASE("hll sketch: check misc1", "[hll_sketch]") {
|
|
|
124
124
|
sk.update(24); // HLL
|
|
125
125
|
REQUIRE(sk.get_updatable_serialization_bytes() == 40 + 256);
|
|
126
126
|
|
|
127
|
-
const
|
|
127
|
+
const auto hllBytes = hll_constants::HLL_BYTE_ARR_START + (1 << lgConfigK);
|
|
128
128
|
REQUIRE(sk.get_compact_serialization_bytes() == hllBytes);
|
|
129
129
|
REQUIRE(hll_sketch::get_max_updatable_serialization_bytes(lgConfigK, HLL_8) == hllBytes);
|
|
130
130
|
}
|
|
@@ -135,22 +135,22 @@ TEST_CASE("hll sketch: check num std dev", "[hll_sketch]") {
|
|
|
135
135
|
REQUIRE_THROWS_AS(HllUtil<>::checkNumStdDev(0), std::invalid_argument);
|
|
136
136
|
}
|
|
137
137
|
|
|
138
|
-
void checkSerializationSizes(
|
|
138
|
+
void checkSerializationSizes(uint8_t lgConfigK, target_hll_type tgtHllType) {
|
|
139
139
|
hll_sketch_test_alloc sk(lgConfigK, tgtHllType, false, 0);
|
|
140
140
|
int i;
|
|
141
141
|
|
|
142
142
|
// LIST
|
|
143
143
|
for (i = 0; i < 7; ++i) { sk.update(i); }
|
|
144
|
-
|
|
144
|
+
auto expected = hll_constants::LIST_INT_ARR_START + (i << 2);
|
|
145
145
|
REQUIRE(sk.get_compact_serialization_bytes() == expected);
|
|
146
|
-
expected =
|
|
146
|
+
expected = hll_constants::LIST_INT_ARR_START + (4 << hll_constants::LG_INIT_LIST_SIZE);
|
|
147
147
|
REQUIRE(sk.get_updatable_serialization_bytes() == expected);
|
|
148
148
|
|
|
149
149
|
// SET
|
|
150
150
|
for (i = 7; i < 24; ++i) { sk.update(i); }
|
|
151
|
-
expected =
|
|
151
|
+
expected = hll_constants::HASH_SET_INT_ARR_START + (i << 2);
|
|
152
152
|
REQUIRE(sk.get_compact_serialization_bytes() == expected);
|
|
153
|
-
expected =
|
|
153
|
+
expected = hll_constants::HASH_SET_INT_ARR_START + (4 << hll_constants::LG_INIT_SET_SIZE);
|
|
154
154
|
REQUIRE(sk.get_updatable_serialization_bytes() == expected);
|
|
155
155
|
}
|
|
156
156
|
|
|
@@ -178,7 +178,7 @@ TEST_CASE("hll sketch: exercise to string", "[hll_sketch]") {
|
|
|
178
178
|
|
|
179
179
|
// Creates and serializes then deserializes sketch.
|
|
180
180
|
// Returns true if deserialized sketch is compact.
|
|
181
|
-
static bool checkCompact(
|
|
181
|
+
static bool checkCompact(uint8_t lgK, const int n, const target_hll_type type, bool compact) {
|
|
182
182
|
hll_sketch_test_alloc sk(lgK, type, false, 0);
|
|
183
183
|
for (int i = 0; i < n; ++i) { sk.update(i); }
|
|
184
184
|
|
|
@@ -201,7 +201,7 @@ static bool checkCompact(const int lgK, const int n, const target_hll_type type,
|
|
|
201
201
|
TEST_CASE("hll sketch: check compact flag", "[hll_sketch]") {
|
|
202
202
|
test_allocator_total_bytes = 0;
|
|
203
203
|
{
|
|
204
|
-
|
|
204
|
+
uint8_t lgK = 8;
|
|
205
205
|
// unless/until we create non-updatable "direct" versions,
|
|
206
206
|
// deserialized image should never be compact
|
|
207
207
|
// LIST: follows serialization request
|
|
@@ -230,10 +230,10 @@ TEST_CASE("hll sketch: check compact flag", "[hll_sketch]") {
|
|
|
230
230
|
TEST_CASE("hll sketch: check k limits", "[hll_sketch]") {
|
|
231
231
|
test_allocator_total_bytes = 0;
|
|
232
232
|
{
|
|
233
|
-
hll_sketch_test_alloc sketch1(
|
|
234
|
-
hll_sketch_test_alloc sketch2(
|
|
235
|
-
REQUIRE_THROWS_AS(hll_sketch_test_alloc(
|
|
236
|
-
REQUIRE_THROWS_AS(hll_sketch_test_alloc(
|
|
233
|
+
hll_sketch_test_alloc sketch1(hll_constants::MIN_LOG_K, target_hll_type::HLL_8, false, 0);
|
|
234
|
+
hll_sketch_test_alloc sketch2(hll_constants::MAX_LOG_K, target_hll_type::HLL_4, false, 0);
|
|
235
|
+
REQUIRE_THROWS_AS(hll_sketch_test_alloc(hll_constants::MIN_LOG_K - 1, target_hll_type::HLL_4, false, 0), std::invalid_argument);
|
|
236
|
+
REQUIRE_THROWS_AS(hll_sketch_test_alloc(hll_constants::MAX_LOG_K + 1, target_hll_type::HLL_4, false, 0), std::invalid_argument);
|
|
237
237
|
}
|
|
238
238
|
REQUIRE(test_allocator_total_bytes == 0);
|
|
239
239
|
}
|
|
@@ -24,23 +24,19 @@
|
|
|
24
24
|
|
|
25
25
|
namespace datasketches {
|
|
26
26
|
|
|
27
|
-
static int min(int a, int b) {
|
|
28
|
-
return (a < b) ? a : b;
|
|
29
|
-
}
|
|
30
|
-
|
|
31
27
|
static void println(std::string& str) {
|
|
32
28
|
//std::cout << str << "\n";
|
|
33
29
|
}
|
|
34
30
|
|
|
35
31
|
static void basicUnion(uint64_t n1, uint64_t n2,
|
|
36
|
-
|
|
32
|
+
uint8_t lgk1, uint8_t lgk2, uint8_t lgMaxK,
|
|
37
33
|
target_hll_type type1, target_hll_type type2, target_hll_type resultType) {
|
|
38
34
|
uint64_t v = 0;
|
|
39
35
|
//int tot = n1 + n2;
|
|
40
36
|
|
|
41
37
|
hll_sketch h1(lgk1, type1);
|
|
42
38
|
hll_sketch h2(lgk2, type2);
|
|
43
|
-
|
|
39
|
+
uint8_t lgControlK = std::min(std::min(lgk1, lgk2), lgMaxK);
|
|
44
40
|
hll_sketch control(lgControlK, resultType);
|
|
45
41
|
|
|
46
42
|
for (uint64_t i = 0; i < n1; ++i) {
|
|
@@ -89,9 +85,9 @@ TEST_CASE("hll union: check unions", "[hll_union]") {
|
|
|
89
85
|
target_hll_type type2 = HLL_8;
|
|
90
86
|
target_hll_type resultType = HLL_8;
|
|
91
87
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
88
|
+
uint8_t lgK1 = 7;
|
|
89
|
+
uint8_t lgK2 = 7;
|
|
90
|
+
uint8_t lgMaxK = 7;
|
|
95
91
|
uint64_t n1 = 7;
|
|
96
92
|
uint64_t n2 = 7;
|
|
97
93
|
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
|
|
@@ -108,7 +104,7 @@ TEST_CASE("hll union: check unions", "[hll_union]") {
|
|
|
108
104
|
n2 = 14;
|
|
109
105
|
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
|
|
110
106
|
|
|
111
|
-
|
|
107
|
+
uint8_t i = 0;
|
|
112
108
|
for (i = 7; i <= 13; ++i) {
|
|
113
109
|
lgK1 = i;
|
|
114
110
|
lgK2 = i;
|
|
@@ -184,9 +180,9 @@ TEST_CASE("hll union: check composite estimate", "[hll_union]") {
|
|
|
184
180
|
}
|
|
185
181
|
|
|
186
182
|
TEST_CASE("hll union: check config k limits", "[hll_union]") {
|
|
187
|
-
REQUIRE_THROWS_AS(hll_union(
|
|
183
|
+
REQUIRE_THROWS_AS(hll_union(hll_constants::MIN_LOG_K - 1), std::invalid_argument);
|
|
188
184
|
|
|
189
|
-
REQUIRE_THROWS_AS(hll_union(
|
|
185
|
+
REQUIRE_THROWS_AS(hll_union(hll_constants::MAX_LOG_K + 1), std::invalid_argument);
|
|
190
186
|
}
|
|
191
187
|
|
|
192
188
|
static double getBound(int lgK, bool ub, bool oooFlag, int numStdDev, double est) {
|
|
@@ -195,7 +191,7 @@ static double getBound(int lgK, bool ub, bool oooFlag, int numStdDev, double est
|
|
|
195
191
|
}
|
|
196
192
|
|
|
197
193
|
TEST_CASE("hll union: check ub lb", "[hll_union]") {
|
|
198
|
-
|
|
194
|
+
uint8_t lgK = 4;
|
|
199
195
|
int n = 1 << 20;
|
|
200
196
|
bool oooFlag = false;
|
|
201
197
|
|
|
@@ -223,7 +219,7 @@ TEST_CASE("hll union: check ub lb", "[hll_union]") {
|
|
|
223
219
|
}
|
|
224
220
|
|
|
225
221
|
TEST_CASE("hll union: check conversions", "[hll_union]") {
|
|
226
|
-
|
|
222
|
+
uint8_t lgK = 4;
|
|
227
223
|
hll_sketch sk1(lgK, HLL_8);
|
|
228
224
|
hll_sketch sk2(lgK, HLL_8);
|
|
229
225
|
int n = 1 << 20;
|
|
@@ -57,7 +57,7 @@ static int get_n(int lg_k, hll_mode mode) {
|
|
|
57
57
|
|
|
58
58
|
static long v = 0;
|
|
59
59
|
|
|
60
|
-
static hll_sketch build_sketch(
|
|
60
|
+
static hll_sketch build_sketch(uint8_t lg_k, target_hll_type hll_type, hll_mode mode) {
|
|
61
61
|
hll_sketch sk(lg_k, hll_type);
|
|
62
62
|
int n = get_n(lg_k, mode);
|
|
63
63
|
for (int i = 0; i < n; i++) sk.update(static_cast<uint64_t>(i + v));
|
|
@@ -67,7 +67,7 @@ static hll_sketch build_sketch(int lg_k, target_hll_type hll_type, hll_mode mode
|
|
|
67
67
|
|
|
68
68
|
// merges a sketch to an empty union and gets result of the same type, checks binary equivalence
|
|
69
69
|
static void union_one_update(bool compact) {
|
|
70
|
-
for (
|
|
70
|
+
for (uint8_t lg_k = 4; lg_k <= 21; lg_k++) { // all lg_k
|
|
71
71
|
for (int mode = 0; mode <= 2; mode++) { // List, Set, Hll
|
|
72
72
|
if ((lg_k < 8) && (mode == 1)) continue; // lg_k < 8 list transitions directly to HLL
|
|
73
73
|
for (int t = 0; t <= 2; t++) { // HLL_4, HLL_6, HLL_8
|
|
@@ -102,7 +102,7 @@ TEST_CASE("hll isomorphic: union one update serialize compact", "[hll_isomorphic
|
|
|
102
102
|
|
|
103
103
|
// converts a sketch to a different type and converts back to the original type to check binary equivalence
|
|
104
104
|
static void convert_back_and_forth(bool compact) {
|
|
105
|
-
for (
|
|
105
|
+
for (uint8_t lg_k = 4; lg_k <= 21; lg_k++) { // all lg_k
|
|
106
106
|
for (int mode = 0; mode <= 2; mode++) { // List, Set, Hll
|
|
107
107
|
if ((lg_k < 8) && (mode == 1)) continue; // lg_k < 8 list transitions directly to HLL
|
|
108
108
|
for (int t1 = 0; t1 <= 2; t1++) { // HLL_4, HLL_6, HLL_8
|
|
@@ -44,11 +44,11 @@ TEST_CASE("hll to/from byte array: double serialize", "[hll_byte_array]") {
|
|
|
44
44
|
auto ser2 = sk.serialize_updatable();
|
|
45
45
|
|
|
46
46
|
REQUIRE(ser1.size() == ser2.size());
|
|
47
|
-
|
|
47
|
+
size_t len = ser1.size();
|
|
48
48
|
uint8_t* b1 = ser1.data();
|
|
49
49
|
uint8_t* b2 = ser2.data();
|
|
50
50
|
|
|
51
|
-
for (
|
|
51
|
+
for (size_t i = 0; i < len; ++i) {
|
|
52
52
|
REQUIRE(b2[i] == b1[i]);
|
|
53
53
|
}
|
|
54
54
|
}
|
|
@@ -129,7 +129,7 @@ static void checkSketchEquality(hll_sketch& sk1, hll_sketch& sk2) {
|
|
|
129
129
|
REQUIRE(sk1.get_target_type() == sk2.get_target_type());
|
|
130
130
|
}
|
|
131
131
|
|
|
132
|
-
static void toFrom(const
|
|
132
|
+
static void toFrom(const uint8_t lgConfigK, const target_hll_type tgtHllType, const int n) {
|
|
133
133
|
hll_sketch src(lgConfigK, tgtHllType);
|
|
134
134
|
for (int i = 0; i < n; ++i) {
|
|
135
135
|
src.update(i);
|
|
@@ -157,7 +157,7 @@ static void toFrom(const int lgConfigK, const target_hll_type tgtHllType, const
|
|
|
157
157
|
TEST_CASE("hll to/from byte array: to from sketch", "[hll_byte_array]") {
|
|
158
158
|
for (int i = 0; i < 10; ++i) {
|
|
159
159
|
int n = nArr[i];
|
|
160
|
-
for (
|
|
160
|
+
for (uint8_t lgK = 4; lgK <= 13; ++lgK) {
|
|
161
161
|
toFrom(lgK, HLL_4, n);
|
|
162
162
|
toFrom(lgK, HLL_6, n);
|
|
163
163
|
toFrom(lgK, HLL_8, n);
|
|
@@ -32,27 +32,17 @@ target_include_directories(kll
|
|
|
32
32
|
target_link_libraries(kll INTERFACE common)
|
|
33
33
|
target_compile_features(kll INTERFACE cxx_std_11)
|
|
34
34
|
|
|
35
|
-
set(kll_HEADERS "")
|
|
36
|
-
list(APPEND kll_HEADERS "include/kll_sketch.hpp")
|
|
37
|
-
list(APPEND kll_HEADERS "include/kll_sketch_impl.hpp")
|
|
38
|
-
list(APPEND kll_HEADERS "include/kll_helper.hpp")
|
|
39
|
-
list(APPEND kll_HEADERS "include/kll_helper_impl.hpp")
|
|
40
|
-
list(APPEND kll_HEADERS "include/kll_quantile_calculator.hpp")
|
|
41
|
-
list(APPEND kll_HEADERS "include/kll_quantile_calculator_impl.hpp")
|
|
42
|
-
|
|
43
35
|
install(TARGETS kll
|
|
44
36
|
EXPORT ${PROJECT_NAME}
|
|
45
37
|
)
|
|
46
38
|
|
|
47
|
-
install(FILES
|
|
39
|
+
install(FILES
|
|
40
|
+
include/kll_sketch.hpp
|
|
41
|
+
include/kll_sketch_impl.hpp
|
|
42
|
+
include/kll_helper.hpp
|
|
43
|
+
include/kll_helper_impl.hpp
|
|
44
|
+
include/kll_quantile_calculator.hpp
|
|
45
|
+
include/kll_quantile_calculator_impl.hpp
|
|
46
|
+
include/kolmogorov_smirnov.hpp
|
|
47
|
+
include/kolmogorov_smirnov_impl.hpp
|
|
48
48
|
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
|
|
49
|
-
|
|
50
|
-
target_sources(kll
|
|
51
|
-
INTERFACE
|
|
52
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/kll_helper.hpp
|
|
53
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/kll_helper_impl.hpp
|
|
54
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/kll_sketch.hpp
|
|
55
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/kll_sketch_impl.hpp
|
|
56
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/kll_quantile_calculator.hpp
|
|
57
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/kll_quantile_calculator_impl.hpp
|
|
58
|
-
)
|
|
@@ -26,7 +26,8 @@
|
|
|
26
26
|
|
|
27
27
|
namespace datasketches {
|
|
28
28
|
|
|
29
|
-
static std::independent_bits_engine<std::mt19937, 1, uint32_t>
|
|
29
|
+
static std::independent_bits_engine<std::mt19937, 1, uint32_t>
|
|
30
|
+
random_bit(static_cast<uint32_t>(std::chrono::system_clock::now().time_since_epoch().count()));
|
|
30
31
|
|
|
31
32
|
#ifdef KLL_VALIDATION
|
|
32
33
|
extern uint32_t kll_next_offset;
|
|
@@ -46,9 +47,9 @@ class kll_helper {
|
|
|
46
47
|
static inline uint8_t floor_of_log2_of_fraction(uint64_t numer, uint64_t denom);
|
|
47
48
|
static inline uint8_t ub_on_num_levels(uint64_t n);
|
|
48
49
|
static inline uint32_t compute_total_capacity(uint16_t k, uint8_t m, uint8_t num_levels);
|
|
49
|
-
static inline
|
|
50
|
-
static inline
|
|
51
|
-
static inline
|
|
50
|
+
static inline uint16_t level_capacity(uint16_t k, uint8_t numLevels, uint8_t height, uint8_t min_wid);
|
|
51
|
+
static inline uint16_t int_cap_aux(uint16_t k, uint8_t depth);
|
|
52
|
+
static inline uint16_t int_cap_aux_aux(uint16_t k, uint8_t depth);
|
|
52
53
|
static inline uint64_t sum_the_sample_weights(uint8_t num_levels, const uint32_t* levels);
|
|
53
54
|
|
|
54
55
|
/*
|
|
@@ -55,28 +55,28 @@ uint32_t kll_helper::compute_total_capacity(uint16_t k, uint8_t m, uint8_t num_l
|
|
|
55
55
|
return total;
|
|
56
56
|
}
|
|
57
57
|
|
|
58
|
-
|
|
58
|
+
uint16_t kll_helper::level_capacity(uint16_t k, uint8_t numLevels, uint8_t height, uint8_t min_wid) {
|
|
59
59
|
if (height >= numLevels) throw std::invalid_argument("height >= numLevels");
|
|
60
60
|
const uint8_t depth = numLevels - height - 1;
|
|
61
|
-
return std::max(
|
|
61
|
+
return std::max<uint16_t>(min_wid, int_cap_aux(k, depth));
|
|
62
62
|
}
|
|
63
63
|
|
|
64
|
-
|
|
64
|
+
uint16_t kll_helper::int_cap_aux(uint16_t k, uint8_t depth) {
|
|
65
65
|
if (depth > 60) throw std::invalid_argument("depth > 60");
|
|
66
66
|
if (depth <= 30) return int_cap_aux_aux(k, depth);
|
|
67
67
|
const uint8_t half = depth / 2;
|
|
68
68
|
const uint8_t rest = depth - half;
|
|
69
|
-
const
|
|
69
|
+
const uint16_t tmp = int_cap_aux_aux(k, half);
|
|
70
70
|
return int_cap_aux_aux(tmp, rest);
|
|
71
71
|
}
|
|
72
72
|
|
|
73
|
-
|
|
73
|
+
uint16_t kll_helper::int_cap_aux_aux(uint16_t k, uint8_t depth) {
|
|
74
74
|
if (depth > 30) throw std::invalid_argument("depth > 30");
|
|
75
75
|
const uint64_t twok = k << 1; // for rounding, we pre-multiply by 2
|
|
76
76
|
const uint64_t tmp = (uint64_t) (((uint64_t) twok << depth) / powers_of_three[depth]);
|
|
77
77
|
const uint64_t result = (tmp + 1) >> 1; // then here we add 1 and divide by 2
|
|
78
78
|
if (result > k) throw std::logic_error("result > k");
|
|
79
|
-
return result;
|
|
79
|
+
return static_cast<uint16_t>(result);
|
|
80
80
|
}
|
|
81
81
|
|
|
82
82
|
uint64_t kll_helper::sum_the_sample_weights(uint8_t num_levels, const uint32_t* levels) {
|
|
@@ -24,19 +24,27 @@
|
|
|
24
24
|
|
|
25
25
|
namespace datasketches {
|
|
26
26
|
|
|
27
|
+
// forward declaration
|
|
28
|
+
template<typename T, typename C, typename S, typename A> class kll_sketch;
|
|
29
|
+
|
|
27
30
|
template <typename T, typename C, typename A>
|
|
28
31
|
class kll_quantile_calculator {
|
|
29
32
|
public:
|
|
30
|
-
|
|
31
|
-
|
|
33
|
+
using Entry = std::pair<T, uint64_t>;
|
|
34
|
+
using AllocEntry = typename std::allocator_traits<A>::template rebind_alloc<Entry>;
|
|
35
|
+
using Container = std::vector<Entry, AllocEntry>;
|
|
36
|
+
using const_iterator = typename Container::const_iterator;
|
|
37
|
+
|
|
38
|
+
template<typename S>
|
|
39
|
+
kll_quantile_calculator(const kll_sketch<T, C, S, A>& sketch);
|
|
40
|
+
|
|
32
41
|
T get_quantile(double fraction) const;
|
|
42
|
+
const_iterator begin() const;
|
|
43
|
+
const_iterator end() const;
|
|
33
44
|
|
|
34
45
|
private:
|
|
35
46
|
using AllocU32 = typename std::allocator_traits<A>::template rebind_alloc<uint32_t>;
|
|
36
47
|
using vector_u32 = std::vector<uint32_t, AllocU32>;
|
|
37
|
-
using Entry = std::pair<T, uint64_t>;
|
|
38
|
-
using AllocEntry = typename std::allocator_traits<A>::template rebind_alloc<Entry>;
|
|
39
|
-
using Container = std::vector<Entry, AllocEntry>;
|
|
40
48
|
uint64_t n_;
|
|
41
49
|
vector_u32 levels_;
|
|
42
50
|
Container entries_;
|
|
@@ -45,7 +53,7 @@ class kll_quantile_calculator {
|
|
|
45
53
|
T approximately_answer_positional_query(uint64_t pos) const;
|
|
46
54
|
void convert_to_preceding_cummulative();
|
|
47
55
|
uint32_t chunk_containing_pos(uint64_t pos) const;
|
|
48
|
-
uint32_t search_for_chunk_containing_pos(uint64_t pos,
|
|
56
|
+
uint32_t search_for_chunk_containing_pos(uint64_t pos, uint64_t l, uint64_t r) const;
|
|
49
57
|
static void merge_sorted_blocks(Container& entries, const uint32_t* levels, uint8_t num_levels, uint32_t num_items);
|
|
50
58
|
static void merge_sorted_blocks_direct(Container& orig, Container& temp, const uint32_t* levels, uint8_t starting_level, uint8_t num_levels);
|
|
51
59
|
static void merge_sorted_blocks_reversed(Container& orig, Container& temp, const uint32_t* levels, uint8_t starting_level, uint8_t num_levels);
|
|
@@ -28,24 +28,38 @@
|
|
|
28
28
|
|
|
29
29
|
namespace datasketches {
|
|
30
30
|
|
|
31
|
-
template
|
|
32
|
-
|
|
33
|
-
|
|
31
|
+
template<typename T, typename C, typename A>
|
|
32
|
+
template<typename S>
|
|
33
|
+
kll_quantile_calculator<T, C, A>::kll_quantile_calculator(const kll_sketch<T, C, S, A>& sketch):
|
|
34
|
+
n_(sketch.n_), levels_(sketch.num_levels_ + 1, 0, sketch.allocator_), entries_(sketch.allocator_)
|
|
34
35
|
{
|
|
35
|
-
const uint32_t num_items =
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
36
|
+
const uint32_t num_items = sketch.levels_[sketch.num_levels_] - sketch.levels_[0];
|
|
37
|
+
if (num_items > 0) {
|
|
38
|
+
entries_.reserve(num_items);
|
|
39
|
+
populate_from_sketch(sketch.items_, sketch.levels_.data(), sketch.num_levels_);
|
|
40
|
+
if (!sketch.is_level_zero_sorted_) std::sort(entries_.begin(), entries_.begin() + levels_[1], compare_pair_by_first<C>());
|
|
41
|
+
merge_sorted_blocks(entries_, levels_.data(), static_cast<uint8_t>(levels_.size()) - 1, num_items);
|
|
42
|
+
if (!is_sorted(entries_.begin(), entries_.end(), compare_pair_by_first<C>())) throw std::logic_error("entries must be sorted");
|
|
43
|
+
convert_to_preceding_cummulative();
|
|
44
|
+
}
|
|
41
45
|
}
|
|
42
46
|
|
|
43
|
-
template
|
|
47
|
+
template<typename T, typename C, typename A>
|
|
44
48
|
T kll_quantile_calculator<T, C, A>::get_quantile(double fraction) const {
|
|
45
49
|
return approximately_answer_positional_query(pos_of_phi(fraction, n_));
|
|
46
50
|
}
|
|
47
51
|
|
|
48
|
-
template
|
|
52
|
+
template<typename T, typename C, typename A>
|
|
53
|
+
auto kll_quantile_calculator<T, C, A>::begin() const -> const_iterator {
|
|
54
|
+
return entries_.begin();
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
template<typename T, typename C, typename A>
|
|
58
|
+
auto kll_quantile_calculator<T, C, A>::end() const -> const_iterator {
|
|
59
|
+
return entries_.end();
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
template<typename T, typename C, typename A>
|
|
49
63
|
void kll_quantile_calculator<T, C, A>::populate_from_sketch(const T* items, const uint32_t* levels, uint8_t num_levels) {
|
|
50
64
|
size_t src_level = 0;
|
|
51
65
|
size_t dst_level = 0;
|
|
@@ -68,7 +82,7 @@ void kll_quantile_calculator<T, C, A>::populate_from_sketch(const T* items, cons
|
|
|
68
82
|
if (levels_.size() > static_cast<size_t>(dst_level + 1)) levels_.resize(dst_level + 1);
|
|
69
83
|
}
|
|
70
84
|
|
|
71
|
-
template
|
|
85
|
+
template<typename T, typename C, typename A>
|
|
72
86
|
T kll_quantile_calculator<T, C, A>::approximately_answer_positional_query(uint64_t pos) const {
|
|
73
87
|
if (pos >= n_) throw std::logic_error("position out of range");
|
|
74
88
|
const uint32_t num_items = levels_[levels_.size() - 1];
|
|
@@ -77,7 +91,7 @@ T kll_quantile_calculator<T, C, A>::approximately_answer_positional_query(uint64
|
|
|
77
91
|
return entries_[index].first;
|
|
78
92
|
}
|
|
79
93
|
|
|
80
|
-
template
|
|
94
|
+
template<typename T, typename C, typename A>
|
|
81
95
|
void kll_quantile_calculator<T, C, A>::convert_to_preceding_cummulative() {
|
|
82
96
|
uint64_t subtotal = 0;
|
|
83
97
|
for (auto& entry: entries_) {
|
|
@@ -87,13 +101,13 @@ void kll_quantile_calculator<T, C, A>::convert_to_preceding_cummulative() {
|
|
|
87
101
|
}
|
|
88
102
|
}
|
|
89
103
|
|
|
90
|
-
template
|
|
104
|
+
template<typename T, typename C, typename A>
|
|
91
105
|
uint64_t kll_quantile_calculator<T, C, A>::pos_of_phi(double phi, uint64_t n) {
|
|
92
|
-
const uint64_t pos = std::floor(phi * n);
|
|
106
|
+
const uint64_t pos = static_cast<uint64_t>(std::floor(phi * n));
|
|
93
107
|
return (pos == n) ? n - 1 : pos;
|
|
94
108
|
}
|
|
95
109
|
|
|
96
|
-
template
|
|
110
|
+
template<typename T, typename C, typename A>
|
|
97
111
|
uint32_t kll_quantile_calculator<T, C, A>::chunk_containing_pos(uint64_t pos) const {
|
|
98
112
|
if (entries_.size() < 1) throw std::logic_error("array too short");
|
|
99
113
|
if (pos < entries_[0].second) throw std::logic_error("position too small");
|
|
@@ -101,19 +115,19 @@ uint32_t kll_quantile_calculator<T, C, A>::chunk_containing_pos(uint64_t pos) co
|
|
|
101
115
|
return search_for_chunk_containing_pos(pos, 0, entries_.size());
|
|
102
116
|
}
|
|
103
117
|
|
|
104
|
-
template
|
|
105
|
-
uint32_t kll_quantile_calculator<T, C, A>::search_for_chunk_containing_pos(uint64_t pos,
|
|
118
|
+
template<typename T, typename C, typename A>
|
|
119
|
+
uint32_t kll_quantile_calculator<T, C, A>::search_for_chunk_containing_pos(uint64_t pos, uint64_t l, uint64_t r) const {
|
|
106
120
|
if (l + 1 == r) {
|
|
107
|
-
return l;
|
|
121
|
+
return static_cast<uint32_t>(l);
|
|
108
122
|
}
|
|
109
|
-
const
|
|
123
|
+
const uint64_t m = l + (r - l) / 2;
|
|
110
124
|
if (entries_[m].second <= pos) {
|
|
111
125
|
return search_for_chunk_containing_pos(pos, m, r);
|
|
112
126
|
}
|
|
113
127
|
return search_for_chunk_containing_pos(pos, l, m);
|
|
114
128
|
}
|
|
115
129
|
|
|
116
|
-
template
|
|
130
|
+
template<typename T, typename C, typename A>
|
|
117
131
|
void kll_quantile_calculator<T, C, A>::merge_sorted_blocks(Container& entries, const uint32_t* levels, uint8_t num_levels, uint32_t num_items) {
|
|
118
132
|
if (num_levels == 1) return;
|
|
119
133
|
Container temporary(entries.get_allocator());
|
|
@@ -121,7 +135,7 @@ void kll_quantile_calculator<T, C, A>::merge_sorted_blocks(Container& entries, c
|
|
|
121
135
|
merge_sorted_blocks_direct(entries, temporary, levels, 0, num_levels);
|
|
122
136
|
}
|
|
123
137
|
|
|
124
|
-
template
|
|
138
|
+
template<typename T, typename C, typename A>
|
|
125
139
|
void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_direct(Container& orig, Container& temp, const uint32_t* levels,
|
|
126
140
|
uint8_t starting_level, uint8_t num_levels) {
|
|
127
141
|
if (num_levels == 1) return;
|
|
@@ -129,10 +143,11 @@ void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_direct(Container& ori
|
|
|
129
143
|
const uint8_t num_levels_2 = num_levels - num_levels_1;
|
|
130
144
|
const uint8_t starting_level_1 = starting_level;
|
|
131
145
|
const uint8_t starting_level_2 = starting_level + num_levels_1;
|
|
132
|
-
const auto
|
|
146
|
+
const auto initial_size = temp.size();
|
|
133
147
|
merge_sorted_blocks_reversed(orig, temp, levels, starting_level_1, num_levels_1);
|
|
134
148
|
merge_sorted_blocks_reversed(orig, temp, levels, starting_level_2, num_levels_2);
|
|
135
149
|
const uint32_t num_items_1 = levels[starting_level_1 + num_levels_1] - levels[starting_level_1];
|
|
150
|
+
const auto chunk_begin = temp.begin() + initial_size;
|
|
136
151
|
std::merge(
|
|
137
152
|
std::make_move_iterator(chunk_begin), std::make_move_iterator(chunk_begin + num_items_1),
|
|
138
153
|
std::make_move_iterator(chunk_begin + num_items_1), std::make_move_iterator(temp.end()),
|
|
@@ -141,7 +156,7 @@ void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_direct(Container& ori
|
|
|
141
156
|
temp.erase(chunk_begin, temp.end());
|
|
142
157
|
}
|
|
143
158
|
|
|
144
|
-
template
|
|
159
|
+
template<typename T, typename C, typename A>
|
|
145
160
|
void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_reversed(Container& orig, Container& temp, const uint32_t* levels,
|
|
146
161
|
uint8_t starting_level, uint8_t num_levels) {
|
|
147
162
|
if (num_levels == 1) {
|
|
@@ -153,15 +153,23 @@ template<typename A> using vector_u32 = std::vector<uint32_t, AllocU32<A>>;
|
|
|
153
153
|
template<typename A> using AllocD = typename std::allocator_traits<A>::template rebind_alloc<double>;
|
|
154
154
|
template<typename A> using vector_d = std::vector<double, AllocD<A>>;
|
|
155
155
|
|
|
156
|
+
namespace kll_constants {
|
|
157
|
+
const uint16_t DEFAULT_K = 200;
|
|
158
|
+
}
|
|
159
|
+
|
|
156
160
|
template <typename T, typename C = std::less<T>, typename S = serde<T>, typename A = std::allocator<T>>
|
|
157
161
|
class kll_sketch {
|
|
158
162
|
public:
|
|
163
|
+
using value_type = T;
|
|
164
|
+
using comparator = C;
|
|
165
|
+
|
|
159
166
|
static const uint8_t DEFAULT_M = 8;
|
|
160
|
-
|
|
167
|
+
// TODO: Redundant and deprecated. Will be removed in next major version.
|
|
168
|
+
static const uint16_t DEFAULT_K = kll_constants::DEFAULT_K;
|
|
161
169
|
static const uint16_t MIN_K = DEFAULT_M;
|
|
162
170
|
static const uint16_t MAX_K = (1 << 16) - 1;
|
|
163
171
|
|
|
164
|
-
explicit kll_sketch(uint16_t k = DEFAULT_K, const A& allocator = A());
|
|
172
|
+
explicit kll_sketch(uint16_t k = kll_constants::DEFAULT_K, const A& allocator = A());
|
|
165
173
|
kll_sketch(const kll_sketch& other);
|
|
166
174
|
kll_sketch(kll_sketch&& other) noexcept;
|
|
167
175
|
~kll_sketch();
|
|
@@ -296,7 +304,7 @@ class kll_sketch {
|
|
|
296
304
|
*
|
|
297
305
|
* @return array of approximations to the given number of evenly-spaced fractional ranks.
|
|
298
306
|
*/
|
|
299
|
-
std::vector<T, A> get_quantiles(
|
|
307
|
+
std::vector<T, A> get_quantiles(uint32_t num) const;
|
|
300
308
|
|
|
301
309
|
/**
|
|
302
310
|
* Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1,
|
|
@@ -383,6 +391,33 @@ class kll_sketch {
|
|
|
383
391
|
template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
|
|
384
392
|
size_t get_serialized_size_bytes() const;
|
|
385
393
|
|
|
394
|
+
/**
|
|
395
|
+
* Returns upper bound on the serialized size of a sketch given a parameter <em>k</em> and stream
|
|
396
|
+
* length. The resulting size is an overestimate to make sure actual sketches don't exceed it.
|
|
397
|
+
* This method can be used if allocation of storage is necessary beforehand, but it is not
|
|
398
|
+
* optimal.
|
|
399
|
+
* This method is for arithmetic types (integral and floating point)
|
|
400
|
+
* @param k parameter that controls size of the sketch and accuracy of estimates
|
|
401
|
+
* @param n stream length
|
|
402
|
+
* @return upper bound on the serialized size
|
|
403
|
+
*/
|
|
404
|
+
template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
|
|
405
|
+
static size_t get_max_serialized_size_bytes(uint16_t k, uint64_t n);
|
|
406
|
+
|
|
407
|
+
/**
|
|
408
|
+
* Returns upper bound on the serialized size of a sketch given a parameter <em>k</em> and stream
|
|
409
|
+
* length. The resulting size is an overestimate to make sure actual sketches don't exceed it.
|
|
410
|
+
* This method can be used if allocation of storage is necessary beforehand, but it is not
|
|
411
|
+
* optimal.
|
|
412
|
+
* This method is for all other non-arithmetic types, and it takes a max size of an item as input.
|
|
413
|
+
* @param k parameter that controls size of the sketch and accuracy of estimates
|
|
414
|
+
* @param n stream length
|
|
415
|
+
* @param max_item_size_bytes maximum size of an item in bytes
|
|
416
|
+
* @return upper bound on the serialized size
|
|
417
|
+
*/
|
|
418
|
+
template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
|
|
419
|
+
static size_t get_max_serialized_size_bytes(uint16_t k, uint64_t n, size_t max_item_size_bytes);
|
|
420
|
+
|
|
386
421
|
/**
|
|
387
422
|
* This method serializes the sketch into a given stream in a binary form
|
|
388
423
|
* @param os output stream
|
|
@@ -391,7 +426,7 @@ class kll_sketch {
|
|
|
391
426
|
|
|
392
427
|
// This is a convenience alias for users
|
|
393
428
|
// The type returned by the following serialize method
|
|
394
|
-
|
|
429
|
+
using vector_bytes = vector_u8<A>;
|
|
395
430
|
|
|
396
431
|
/**
|
|
397
432
|
* This method serializes the sketch as a vector of bytes.
|
|
@@ -480,6 +515,8 @@ class kll_sketch {
|
|
|
480
515
|
T* max_value_;
|
|
481
516
|
bool is_level_zero_sorted_;
|
|
482
517
|
|
|
518
|
+
friend class kll_quantile_calculator<T, C, A>;
|
|
519
|
+
|
|
483
520
|
// for deserialization
|
|
484
521
|
class item_deleter;
|
|
485
522
|
class items_deleter;
|