datasketches 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/datasketches/cpc_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
- data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
- data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
- data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
- data/vendor/datasketches-cpp/python/README.md +5 -5
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
- data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
- data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
- data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
- data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
- data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
- data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +31 -3
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <cmath>
|
|
21
|
+
#include <catch2/catch.hpp>
|
|
22
|
+
|
|
23
|
+
#include <density_sketch.hpp>
|
|
24
|
+
|
|
25
|
+
namespace datasketches {
|
|
26
|
+
|
|
27
|
+
TEST_CASE("density sketch: empty", "[density_sketch]") {
|
|
28
|
+
density_sketch<float> sketch(10, 3);
|
|
29
|
+
REQUIRE(sketch.is_empty());
|
|
30
|
+
REQUIRE_THROWS_AS(sketch.get_estimate({0, 0, 0}), std::runtime_error);
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
TEST_CASE("density sketch: one item", "[density_sketch]") {
|
|
34
|
+
density_sketch<float> sketch(10, 3);
|
|
35
|
+
|
|
36
|
+
// dimension mismatch
|
|
37
|
+
REQUIRE_THROWS_AS(sketch.update(std::vector<float>({0, 0})), std::invalid_argument);
|
|
38
|
+
|
|
39
|
+
sketch.update(std::vector<float>({0, 0, 0}));
|
|
40
|
+
REQUIRE_FALSE(sketch.is_empty());
|
|
41
|
+
REQUIRE_FALSE(sketch.is_estimation_mode());
|
|
42
|
+
REQUIRE(sketch.get_estimate({0, 0, 0}) == 1);
|
|
43
|
+
REQUIRE(sketch.get_estimate({0.01, 0.01, 0.01}) > 0.95);
|
|
44
|
+
REQUIRE(sketch.get_estimate({1, 1, 1}) < 0.05);
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
TEST_CASE("density sketch: merge", "[density_sketch]") {
|
|
48
|
+
density_sketch<float> sketch1(10, 4);
|
|
49
|
+
sketch1.update(std::vector<float>({0, 0, 0, 0}));
|
|
50
|
+
sketch1.update(std::vector<float>({1, 2, 3, 4}));
|
|
51
|
+
|
|
52
|
+
density_sketch<float> sketch2(10, 4);
|
|
53
|
+
sketch2.update(std::vector<float>({5, 6, 7, 8}));
|
|
54
|
+
|
|
55
|
+
sketch1.merge(sketch2);
|
|
56
|
+
|
|
57
|
+
REQUIRE(sketch1.get_n() == 3);
|
|
58
|
+
REQUIRE(sketch1.get_num_retained() == 3);
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
TEST_CASE("density sketch: iterator", "[density_sketch]") {
|
|
62
|
+
density_sketch<float> sketch(10, 3);
|
|
63
|
+
unsigned n = 1000;
|
|
64
|
+
for (unsigned i = 1; i <= n; ++i) sketch.update(std::vector<float>(3, i));
|
|
65
|
+
REQUIRE(sketch.get_n() == n);
|
|
66
|
+
REQUIRE(sketch.is_estimation_mode());
|
|
67
|
+
//std::cout << sketch.to_string(true, true);
|
|
68
|
+
unsigned count = 0;
|
|
69
|
+
for (auto pair: sketch) {
|
|
70
|
+
++count;
|
|
71
|
+
// just to assert something about the output
|
|
72
|
+
REQUIRE(pair.first.size() == sketch.get_dim());
|
|
73
|
+
}
|
|
74
|
+
REQUIRE(count == sketch.get_num_retained());
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
// spherical kernel for testing, returns 1 for vectors within radius and 0 otherwise
|
|
78
|
+
template<typename T>
|
|
79
|
+
struct spherical_kernel {
|
|
80
|
+
spherical_kernel(T radius = 1.0) : _radius_squared(radius * radius) {}
|
|
81
|
+
T operator()(const std::vector<T>& v1, const std::vector<T>& v2) const {
|
|
82
|
+
return std::inner_product(v1.begin(), v1.end(), v2.begin(), 0.0, std::plus<T>(), [](T a, T b){return (a-b)*(a-b);}) <= _radius_squared ? 1.0 : 0.0;
|
|
83
|
+
}
|
|
84
|
+
private:
|
|
85
|
+
T _radius_squared;
|
|
86
|
+
};
|
|
87
|
+
|
|
88
|
+
TEST_CASE("custom kernel", "[density_sketch]") {
|
|
89
|
+
density_sketch<float, spherical_kernel<float>> sketch(10, 3, spherical_kernel<float>(0.5));
|
|
90
|
+
|
|
91
|
+
// update with (1,1,1) and test points inside and outside the kernel
|
|
92
|
+
sketch.update(std::vector<float>(3, 1.0));
|
|
93
|
+
REQUIRE(sketch.get_estimate(std::vector<float>(3, 1.001)) == 1.0);
|
|
94
|
+
REQUIRE(sketch.get_estimate(std::vector<float>(3, 2.0)) == 0.0);
|
|
95
|
+
|
|
96
|
+
// rest of test follows iterator test above
|
|
97
|
+
unsigned n = 1000;
|
|
98
|
+
for (unsigned i = 2; i <= n; ++i) sketch.update(std::vector<float>(3, i));
|
|
99
|
+
REQUIRE(sketch.get_n() == n);
|
|
100
|
+
REQUIRE(sketch.is_estimation_mode());
|
|
101
|
+
unsigned count = 0;
|
|
102
|
+
for (auto pair: sketch) {
|
|
103
|
+
++count;
|
|
104
|
+
// just to assert something about the output
|
|
105
|
+
REQUIRE(pair.first.size() == sketch.get_dim());
|
|
106
|
+
}
|
|
107
|
+
REQUIRE(count == sketch.get_num_retained());
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
TEST_CASE("serialize empty", "[density_sketch]") {
|
|
111
|
+
density_sketch<double> sk(10, 2);
|
|
112
|
+
auto bytes = sk.serialize();
|
|
113
|
+
auto sk2 = density_sketch<double>::deserialize(bytes.data(), bytes.size());
|
|
114
|
+
REQUIRE(sk2.is_empty());
|
|
115
|
+
REQUIRE(!sk2.is_estimation_mode());
|
|
116
|
+
REQUIRE(sk.get_k() == sk2.get_k());
|
|
117
|
+
REQUIRE(sk.get_dim() == sk2.get_dim());
|
|
118
|
+
REQUIRE(sk.get_n() == sk2.get_n());
|
|
119
|
+
REQUIRE(sk.get_num_retained() == sk2.get_num_retained());
|
|
120
|
+
|
|
121
|
+
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
|
122
|
+
sk.serialize(s);
|
|
123
|
+
auto sk3 = density_sketch<double>::deserialize(s);
|
|
124
|
+
REQUIRE(sk3.is_empty());
|
|
125
|
+
REQUIRE(!sk3.is_estimation_mode());
|
|
126
|
+
REQUIRE(sk.get_k() == sk3.get_k());
|
|
127
|
+
REQUIRE(sk.get_dim() == sk3.get_dim());
|
|
128
|
+
REQUIRE(sk.get_n() == sk3.get_n());
|
|
129
|
+
REQUIRE(sk.get_num_retained() == sk3.get_num_retained());
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
TEST_CASE("serialize bytes", "[density_sketch]") {
|
|
133
|
+
uint16_t k = 10;
|
|
134
|
+
uint32_t dim = 3;
|
|
135
|
+
density_sketch<double> sk(k, dim);
|
|
136
|
+
|
|
137
|
+
for (uint16_t i = 0; i < k; ++i) {
|
|
138
|
+
double val = static_cast<double>(i);
|
|
139
|
+
sk.update(std::vector<double>({val, std::sqrt(val), -val}));
|
|
140
|
+
}
|
|
141
|
+
REQUIRE(!sk.is_estimation_mode());
|
|
142
|
+
|
|
143
|
+
// exact mode
|
|
144
|
+
auto bytes = sk.serialize();
|
|
145
|
+
auto sk2 = density_sketch<double>::deserialize(bytes.data(), bytes.size());
|
|
146
|
+
REQUIRE(!sk2.is_empty());
|
|
147
|
+
REQUIRE(!sk2.is_estimation_mode());
|
|
148
|
+
REQUIRE(sk.get_k() == sk2.get_k());
|
|
149
|
+
REQUIRE(sk.get_dim() == sk2.get_dim());
|
|
150
|
+
REQUIRE(sk.get_n() == sk2.get_n());
|
|
151
|
+
REQUIRE(sk.get_num_retained() == sk2.get_num_retained());
|
|
152
|
+
auto it1 = sk.begin();
|
|
153
|
+
auto it2 = sk2.begin();
|
|
154
|
+
while (it1 != sk.end()) {
|
|
155
|
+
REQUIRE(it1->first[0] == it2->first[0]);
|
|
156
|
+
REQUIRE(it1->second == it2->second);
|
|
157
|
+
++it1;
|
|
158
|
+
++it2;
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
// estimation mode
|
|
162
|
+
size_t n = 1031;
|
|
163
|
+
for (uint32_t i = k; i < n; ++i) {
|
|
164
|
+
double val = static_cast<double>(i);
|
|
165
|
+
sk.update(std::vector<double>({val, std::sqrt(val), -val}));
|
|
166
|
+
}
|
|
167
|
+
REQUIRE(sk.is_estimation_mode());
|
|
168
|
+
|
|
169
|
+
bytes = sk.serialize();
|
|
170
|
+
sk2 = density_sketch<double>::deserialize(bytes.data(), bytes.size());
|
|
171
|
+
REQUIRE(!sk2.is_empty());
|
|
172
|
+
REQUIRE(sk2.is_estimation_mode());
|
|
173
|
+
REQUIRE(sk.get_k() == sk2.get_k());
|
|
174
|
+
REQUIRE(sk.get_dim() == sk2.get_dim());
|
|
175
|
+
REQUIRE(sk.get_n() == sk2.get_n());
|
|
176
|
+
REQUIRE(sk.get_num_retained() == sk2.get_num_retained());
|
|
177
|
+
it1 = sk.begin();
|
|
178
|
+
it2 = sk2.begin();
|
|
179
|
+
while (it1 != sk.end()) {
|
|
180
|
+
REQUIRE(it1->first[0] == it2->first[0]);
|
|
181
|
+
REQUIRE(it1->second == it2->second);
|
|
182
|
+
++it1;
|
|
183
|
+
++it2;
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
TEST_CASE("serialize stream", "[density_sketch]") {
|
|
188
|
+
uint16_t k = 10;
|
|
189
|
+
uint32_t dim = 3;
|
|
190
|
+
density_sketch<float> sk(k, dim);
|
|
191
|
+
|
|
192
|
+
for (uint16_t i = 0; i < k; ++i) {
|
|
193
|
+
float val = static_cast<float>(i);
|
|
194
|
+
sk.update(std::vector<float>({val, std::sin(val), std::cos(val)}));
|
|
195
|
+
}
|
|
196
|
+
REQUIRE(!sk.is_estimation_mode());
|
|
197
|
+
|
|
198
|
+
// exact mode
|
|
199
|
+
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
|
200
|
+
sk.serialize(s);
|
|
201
|
+
auto sk2 = density_sketch<float>::deserialize(s);
|
|
202
|
+
REQUIRE(!sk2.is_empty());
|
|
203
|
+
REQUIRE(!sk2.is_estimation_mode());
|
|
204
|
+
REQUIRE(sk.get_k() == sk2.get_k());
|
|
205
|
+
REQUIRE(sk.get_dim() == sk2.get_dim());
|
|
206
|
+
REQUIRE(sk.get_n() == sk2.get_n());
|
|
207
|
+
REQUIRE(sk.get_num_retained() == sk2.get_num_retained());
|
|
208
|
+
auto it1 = sk.begin();
|
|
209
|
+
auto it2 = sk2.begin();
|
|
210
|
+
while (it1 != sk.end()) {
|
|
211
|
+
REQUIRE(it1->first[0] == it2->first[0]);
|
|
212
|
+
REQUIRE(it1->second == it2->second);
|
|
213
|
+
++it1;
|
|
214
|
+
++it2;
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
// estimation mode
|
|
218
|
+
size_t n = 1031;
|
|
219
|
+
for (uint32_t i = k; i < n; ++i) {
|
|
220
|
+
float val = static_cast<float>(i);
|
|
221
|
+
sk.update(std::vector<float>({val, std::sqrt(val), -val}));
|
|
222
|
+
}
|
|
223
|
+
REQUIRE(sk.is_estimation_mode());
|
|
224
|
+
|
|
225
|
+
std::stringstream s2(std::ios::in | std::ios::out | std::ios::binary);
|
|
226
|
+
sk.serialize(s2);
|
|
227
|
+
sk2 = density_sketch<float>::deserialize(s2);
|
|
228
|
+
REQUIRE(!sk2.is_empty());
|
|
229
|
+
REQUIRE(sk2.is_estimation_mode());
|
|
230
|
+
REQUIRE(sk.get_k() == sk2.get_k());
|
|
231
|
+
REQUIRE(sk.get_dim() == sk2.get_dim());
|
|
232
|
+
REQUIRE(sk.get_n() == sk2.get_n());
|
|
233
|
+
REQUIRE(sk.get_num_retained() == sk2.get_num_retained());
|
|
234
|
+
it1 = sk.begin();
|
|
235
|
+
it2 = sk2.begin();
|
|
236
|
+
while (it1 != sk.end()) {
|
|
237
|
+
REQUIRE(it1->first[0] == it2->first[0]);
|
|
238
|
+
REQUIRE(it1->second == it2->second);
|
|
239
|
+
++it1;
|
|
240
|
+
++it2;
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
} /* namespace datasketches */
|
|
@@ -91,8 +91,14 @@ private:
|
|
|
91
91
|
|
|
92
92
|
// This iterator uses strides based on golden ratio to avoid clustering during merge
|
|
93
93
|
template<typename K, typename V, typename H, typename E, typename A>
|
|
94
|
-
class reverse_purge_hash_map<K, V, H, E, A>::iterator
|
|
94
|
+
class reverse_purge_hash_map<K, V, H, E, A>::iterator {
|
|
95
95
|
public:
|
|
96
|
+
using iterator_category = std::input_iterator_tag;
|
|
97
|
+
using value_type = std::pair<K&, V>;
|
|
98
|
+
using difference_type = void;
|
|
99
|
+
using pointer = void;
|
|
100
|
+
using reference = const value_type;
|
|
101
|
+
|
|
96
102
|
friend class reverse_purge_hash_map<K, V, H, E, A>;
|
|
97
103
|
iterator& operator++() {
|
|
98
104
|
++count;
|
|
@@ -107,8 +113,8 @@ public:
|
|
|
107
113
|
iterator operator++(int) { iterator tmp(*this); operator++(); return tmp; }
|
|
108
114
|
bool operator==(const iterator& rhs) const { return count == rhs.count; }
|
|
109
115
|
bool operator!=(const iterator& rhs) const { return count != rhs.count; }
|
|
110
|
-
|
|
111
|
-
return
|
|
116
|
+
reference operator*() const {
|
|
117
|
+
return value_type(map->keys_[index], map->values_[index]);
|
|
112
118
|
}
|
|
113
119
|
private:
|
|
114
120
|
static constexpr double GOLDEN_RATIO_RECIPROCAL = 0.6180339887498949; // = (sqrt(5) - 1) / 2
|
|
@@ -51,6 +51,22 @@ Hll4Array<A>::Hll4Array(const Hll4Array<A>& that) :
|
|
|
51
51
|
}
|
|
52
52
|
}
|
|
53
53
|
|
|
54
|
+
template<typename A>
|
|
55
|
+
Hll4Array<A>::Hll4Array(const HllArray<A>& other) :
|
|
56
|
+
HllArray<A>(other.getLgConfigK(), target_hll_type::HLL_4, other.isStartFullSize(), other.getAllocator()),
|
|
57
|
+
auxHashMap_(nullptr)
|
|
58
|
+
{
|
|
59
|
+
const int numBytes = this->hll4ArrBytes(this->lgConfigK_);
|
|
60
|
+
this->hllByteArr_.resize(numBytes, 0);
|
|
61
|
+
this->oooFlag_ = other.isOutOfOrderFlag();
|
|
62
|
+
|
|
63
|
+
for (const auto coupon : other) { // all = false, so skip empty values
|
|
64
|
+
internalCouponUpdate(coupon); // updates KxQ registers
|
|
65
|
+
}
|
|
66
|
+
this->hipAccum_ = other.getHipAccum();
|
|
67
|
+
this->rebuild_kxq_curmin_ = false;
|
|
68
|
+
}
|
|
69
|
+
|
|
54
70
|
template<typename A>
|
|
55
71
|
Hll4Array<A>::~Hll4Array() {
|
|
56
72
|
// hllByteArr deleted in parent
|
|
@@ -114,10 +130,9 @@ uint8_t Hll4Array<A>::getSlot(uint32_t slotNo) const {
|
|
|
114
130
|
}
|
|
115
131
|
|
|
116
132
|
template<typename A>
|
|
117
|
-
uint8_t Hll4Array<A>::
|
|
118
|
-
const uint8_t value = getSlot(index);
|
|
133
|
+
uint8_t Hll4Array<A>::adjustRawValue(uint32_t slot, uint8_t value) const {
|
|
119
134
|
if (value != hll_constants::AUX_TOKEN) return value + this->curMin_;
|
|
120
|
-
return auxHashMap_->mustFindValueFor(
|
|
135
|
+
return auxHashMap_->mustFindValueFor(slot);
|
|
121
136
|
}
|
|
122
137
|
|
|
123
138
|
template<typename A>
|
|
@@ -210,7 +225,7 @@ void Hll4Array<A>::internalHll4Update(uint32_t slotNo, uint8_t newVal) {
|
|
|
210
225
|
|
|
211
226
|
// we just increased a pair value, so it might be time to change curMin
|
|
212
227
|
if (actualOldValue == this->curMin_) { // 908
|
|
213
|
-
this->
|
|
228
|
+
--(this->numAtCurMin_);
|
|
214
229
|
while (this->numAtCurMin_ == 0) {
|
|
215
230
|
shiftToBiggerCurMin(); // increases curMin by 1, builds a new aux table
|
|
216
231
|
// shifts values in 4-bit table and recounts curMin
|
|
@@ -328,13 +343,6 @@ typename HllArray<A>::const_iterator Hll4Array<A>::end() const {
|
|
|
328
343
|
this->tgtHllType_, auxHashMap_, this->curMin_, false);
|
|
329
344
|
}
|
|
330
345
|
|
|
331
|
-
template<typename A>
|
|
332
|
-
void Hll4Array<A>::mergeHll(const HllArray<A>& src) {
|
|
333
|
-
for (const auto coupon: src) {
|
|
334
|
-
internalCouponUpdate(coupon);
|
|
335
|
-
}
|
|
336
|
-
}
|
|
337
|
-
|
|
338
346
|
}
|
|
339
347
|
|
|
340
348
|
#endif // _HLL4ARRAY_INTERNAL_HPP_
|
|
@@ -25,14 +25,12 @@
|
|
|
25
25
|
|
|
26
26
|
namespace datasketches {
|
|
27
27
|
|
|
28
|
-
template<typename A>
|
|
29
|
-
class Hll4Iterator;
|
|
30
|
-
|
|
31
28
|
template<typename A>
|
|
32
29
|
class Hll4Array final : public HllArray<A> {
|
|
33
30
|
public:
|
|
34
31
|
explicit Hll4Array(uint8_t lgConfigK, bool startFullSize, const A& allocator);
|
|
35
32
|
explicit Hll4Array(const Hll4Array<A>& that);
|
|
33
|
+
explicit Hll4Array(const HllArray<A>& that);
|
|
36
34
|
|
|
37
35
|
virtual ~Hll4Array();
|
|
38
36
|
virtual std::function<void(HllSketchImpl<A>*)> get_deleter() const;
|
|
@@ -41,13 +39,12 @@ class Hll4Array final : public HllArray<A> {
|
|
|
41
39
|
|
|
42
40
|
inline uint8_t getSlot(uint32_t slotNo) const;
|
|
43
41
|
inline void putSlot(uint32_t slotNo, uint8_t value);
|
|
44
|
-
inline uint8_t
|
|
42
|
+
inline uint8_t adjustRawValue(uint32_t index, uint8_t value) const;
|
|
45
43
|
|
|
46
44
|
virtual uint32_t getUpdatableSerializationBytes() const;
|
|
47
45
|
virtual uint32_t getHllByteArrBytes() const;
|
|
48
46
|
|
|
49
47
|
virtual HllSketchImpl<A>* couponUpdate(uint32_t coupon) final;
|
|
50
|
-
void mergeHll(const HllArray<A>& src);
|
|
51
48
|
|
|
52
49
|
virtual AuxHashMap<A>* getAuxHashMap() const;
|
|
53
50
|
// does *not* delete old map if overwriting
|
|
@@ -34,6 +34,25 @@ HllArray<A>(lgConfigK, target_hll_type::HLL_6, startFullSize, allocator)
|
|
|
34
34
|
this->hllByteArr_.resize(numBytes, 0);
|
|
35
35
|
}
|
|
36
36
|
|
|
37
|
+
template<typename A>
|
|
38
|
+
Hll6Array<A>::Hll6Array(const HllArray<A>& other) :
|
|
39
|
+
HllArray<A>(other.getLgConfigK(), target_hll_type::HLL_6, other.isStartFullSize(), other.getAllocator())
|
|
40
|
+
{
|
|
41
|
+
const int numBytes = this->hll6ArrBytes(this->lgConfigK_);
|
|
42
|
+
this->hllByteArr_.resize(numBytes, 0);
|
|
43
|
+
this->oooFlag_ = other.isOutOfOrderFlag();
|
|
44
|
+
uint32_t num_zeros = 1 << this->lgConfigK_;
|
|
45
|
+
|
|
46
|
+
for (const auto coupon : other) { // all = false, so skip empty values
|
|
47
|
+
num_zeros--;
|
|
48
|
+
internalCouponUpdate(coupon); // updates KxQ registers
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
this->numAtCurMin_ = num_zeros;
|
|
52
|
+
this->hipAccum_ = other.getHipAccum();
|
|
53
|
+
this->rebuild_kxq_curmin_ = false;
|
|
54
|
+
}
|
|
55
|
+
|
|
37
56
|
template<typename A>
|
|
38
57
|
std::function<void(HllSketchImpl<A>*)> Hll6Array<A>::get_deleter() const {
|
|
39
58
|
return [](HllSketchImpl<A>* ptr) {
|
|
@@ -101,13 +120,6 @@ void Hll6Array<A>::internalCouponUpdate(uint32_t coupon) {
|
|
|
101
120
|
}
|
|
102
121
|
}
|
|
103
122
|
|
|
104
|
-
template<typename A>
|
|
105
|
-
void Hll6Array<A>::mergeHll(const HllArray<A>& src) {
|
|
106
|
-
for (const auto coupon: src) {
|
|
107
|
-
internalCouponUpdate(coupon);
|
|
108
|
-
}
|
|
109
|
-
}
|
|
110
|
-
|
|
111
123
|
}
|
|
112
124
|
|
|
113
125
|
#endif // _HLL6ARRAY_INTERNAL_HPP_
|
|
@@ -31,6 +31,7 @@ template<typename A>
|
|
|
31
31
|
class Hll6Array final : public HllArray<A> {
|
|
32
32
|
public:
|
|
33
33
|
Hll6Array(uint8_t lgConfigK, bool startFullSize, const A& allocator);
|
|
34
|
+
explicit Hll6Array(const HllArray<A>& that);
|
|
34
35
|
|
|
35
36
|
virtual ~Hll6Array() = default;
|
|
36
37
|
virtual std::function<void(HllSketchImpl<A>*)> get_deleter() const;
|
|
@@ -41,7 +42,6 @@ class Hll6Array final : public HllArray<A> {
|
|
|
41
42
|
inline void putSlot(uint32_t slotNo, uint8_t value);
|
|
42
43
|
|
|
43
44
|
virtual HllSketchImpl<A>* couponUpdate(uint32_t coupon) final;
|
|
44
|
-
void mergeHll(const HllArray<A>& src);
|
|
45
45
|
|
|
46
46
|
virtual uint32_t getHllByteArrBytes() const;
|
|
47
47
|
|
|
@@ -32,6 +32,25 @@ HllArray<A>(lgConfigK, target_hll_type::HLL_8, startFullSize, allocator)
|
|
|
32
32
|
this->hllByteArr_.resize(numBytes, 0);
|
|
33
33
|
}
|
|
34
34
|
|
|
35
|
+
template<typename A>
|
|
36
|
+
Hll8Array<A>::Hll8Array(const HllArray<A>& other):
|
|
37
|
+
HllArray<A>(other.getLgConfigK(), target_hll_type::HLL_8, other.isStartFullSize(), other.getAllocator())
|
|
38
|
+
{
|
|
39
|
+
const int numBytes = this->hll8ArrBytes(this->lgConfigK_);
|
|
40
|
+
this->hllByteArr_.resize(numBytes, 0);
|
|
41
|
+
this->oooFlag_ = other.isOutOfOrderFlag();
|
|
42
|
+
uint32_t num_zeros = 1 << this->lgConfigK_;
|
|
43
|
+
|
|
44
|
+
for (const auto coupon : other) { // all = false, so skip empty values
|
|
45
|
+
num_zeros--;
|
|
46
|
+
internalCouponUpdate(coupon); // updates KxQ registers
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
this->numAtCurMin_ = num_zeros;
|
|
50
|
+
this->hipAccum_ = other.getHipAccum();
|
|
51
|
+
this->rebuild_kxq_curmin_ = false;
|
|
52
|
+
}
|
|
53
|
+
|
|
35
54
|
template<typename A>
|
|
36
55
|
std::function<void(HllSketchImpl<A>*)> Hll8Array<A>::get_deleter() const {
|
|
37
56
|
return [](HllSketchImpl<A>* ptr) {
|
|
@@ -77,13 +96,11 @@ void Hll8Array<A>::internalCouponUpdate(uint32_t coupon) {
|
|
|
77
96
|
const uint32_t slotNo = HllUtil<A>::getLow26(coupon) & configKmask;
|
|
78
97
|
const uint8_t newVal = HllUtil<A>::getValue(coupon);
|
|
79
98
|
|
|
80
|
-
const uint8_t curVal =
|
|
99
|
+
const uint8_t curVal = this->hllByteArr_[slotNo];
|
|
81
100
|
if (newVal > curVal) {
|
|
82
|
-
|
|
101
|
+
this->hllByteArr_[slotNo] = newVal;
|
|
83
102
|
this->hipAndKxQIncrementalUpdate(curVal, newVal);
|
|
84
|
-
|
|
85
|
-
this->numAtCurMin_--; // interpret numAtCurMin as num zeros
|
|
86
|
-
}
|
|
103
|
+
this->numAtCurMin_ -= curVal == 0; // interpret numAtCurMin as num zeros
|
|
87
104
|
}
|
|
88
105
|
}
|
|
89
106
|
|
|
@@ -97,49 +114,88 @@ void Hll8Array<A>::mergeList(const CouponList<A>& src) {
|
|
|
97
114
|
template<typename A>
|
|
98
115
|
void Hll8Array<A>::mergeHll(const HllArray<A>& src) {
|
|
99
116
|
// at this point src_k >= dst_k
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
const uint8_t old_v = this->hllByteArr_[j];
|
|
108
|
-
if (new_v > old_v) {
|
|
109
|
-
this->hllByteArr_[j] = new_v;
|
|
110
|
-
this->hipAndKxQIncrementalUpdate(old_v, new_v);
|
|
111
|
-
if (old_v == 0) {
|
|
112
|
-
this->numAtCurMin_--;
|
|
113
|
-
}
|
|
117
|
+
// we can optimize further when the k values are equal
|
|
118
|
+
if (this->getLgConfigK() == src.getLgConfigK()) {
|
|
119
|
+
if (src.getTgtHllType() == target_hll_type::HLL_8) {
|
|
120
|
+
uint32_t i = 0;
|
|
121
|
+
for (const auto value: src.getHllArray()) {
|
|
122
|
+
this->hllByteArr_[i] = std::max(this->hllByteArr_[i], value);
|
|
123
|
+
++i;
|
|
114
124
|
}
|
|
115
|
-
}
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
const uint8_t
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
125
|
+
} else if (src.getTgtHllType() == target_hll_type::HLL_6) {
|
|
126
|
+
const uint32_t src_k = 1 << src.getLgConfigK();
|
|
127
|
+
uint32_t i = 0;
|
|
128
|
+
const uint8_t* ptr = src.getHllArray().data();
|
|
129
|
+
while (i < src_k) {
|
|
130
|
+
uint8_t value = *ptr & 0x3f;
|
|
131
|
+
this->hllByteArr_[i] = std::max(this->hllByteArr_[i], value);
|
|
132
|
+
++i;
|
|
133
|
+
value = *ptr++ >> 6;
|
|
134
|
+
value |= (*ptr & 0x0f) << 2;
|
|
135
|
+
this->hllByteArr_[i] = std::max(this->hllByteArr_[i], value);
|
|
136
|
+
++i;
|
|
137
|
+
value = *ptr++ >> 4;
|
|
138
|
+
value |= (*ptr & 3) << 4;
|
|
139
|
+
this->hllByteArr_[i] = std::max(this->hllByteArr_[i], value);
|
|
140
|
+
++i;
|
|
141
|
+
value = *ptr++ >> 2;
|
|
142
|
+
this->hllByteArr_[i] = std::max(this->hllByteArr_[i], value);
|
|
143
|
+
++i;
|
|
144
|
+
}
|
|
145
|
+
} else { // HLL_4
|
|
146
|
+
const auto& src4 = static_cast<const Hll4Array<A>&>(src);
|
|
147
|
+
uint32_t i = 0;
|
|
148
|
+
for (const auto byte: src.getHllArray()) {
|
|
149
|
+
this->hllByteArr_[i] = std::max(this->hllByteArr_[i], src4.adjustRawValue(i, byte & hll_constants::loNibbleMask));
|
|
150
|
+
++i;
|
|
151
|
+
this->hllByteArr_[i] = std::max(this->hllByteArr_[i], src4.adjustRawValue(i, byte >> 4));
|
|
152
|
+
++i;
|
|
127
153
|
}
|
|
128
154
|
}
|
|
129
|
-
} else {
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
155
|
+
} else {
|
|
156
|
+
// src_k > dst_k
|
|
157
|
+
const uint32_t dst_mask = (1 << this->getLgConfigK()) - 1;
|
|
158
|
+
// special treatment below to optimize performance
|
|
159
|
+
if (src.getTgtHllType() == target_hll_type::HLL_8) {
|
|
160
|
+
uint32_t i = 0;
|
|
161
|
+
for (const auto value: src.getHllArray()) {
|
|
162
|
+
processValue(i++, dst_mask, value);
|
|
163
|
+
}
|
|
164
|
+
} else if (src.getTgtHllType() == target_hll_type::HLL_6) {
|
|
165
|
+
const uint32_t src_k = 1 << src.getLgConfigK();
|
|
166
|
+
uint32_t i = 0;
|
|
167
|
+
const uint8_t* ptr = src.getHllArray().data();
|
|
168
|
+
while (i < src_k) {
|
|
169
|
+
uint8_t value = *ptr & 0x3f;
|
|
170
|
+
processValue(i++, dst_mask, value);
|
|
171
|
+
value = *ptr++ >> 6;
|
|
172
|
+
value |= (*ptr & 0x0f) << 2;
|
|
173
|
+
processValue(i++, dst_mask, value);
|
|
174
|
+
value = *ptr++ >> 4;
|
|
175
|
+
value |= (*ptr & 3) << 4;
|
|
176
|
+
processValue(i++, dst_mask, value);
|
|
177
|
+
value = *ptr++ >> 2;
|
|
178
|
+
processValue(i++, dst_mask, value);
|
|
179
|
+
}
|
|
180
|
+
} else { // HLL_4
|
|
181
|
+
const auto& src4 = static_cast<const Hll4Array<A>&>(src);
|
|
182
|
+
uint32_t i = 0;
|
|
183
|
+
for (const auto byte: src.getHllArray()) {
|
|
184
|
+
processValue(i, dst_mask, src4.adjustRawValue(i, byte & hll_constants::loNibbleMask));
|
|
185
|
+
++i;
|
|
186
|
+
processValue(i, dst_mask, src4.adjustRawValue(i, byte >> 4));
|
|
187
|
+
++i;
|
|
140
188
|
}
|
|
141
189
|
}
|
|
142
190
|
}
|
|
191
|
+
this->setRebuildKxqCurminFlag(true);
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
template<typename A>
|
|
196
|
+
void Hll8Array<A>::processValue(uint32_t slot, uint32_t mask, uint8_t new_val) {
|
|
197
|
+
const size_t index = slot & mask;
|
|
198
|
+
this->hllByteArr_[index] = std::max(this->hllByteArr_[index], new_val);
|
|
143
199
|
}
|
|
144
200
|
|
|
145
201
|
}
|
|
@@ -31,6 +31,7 @@ template<typename A>
|
|
|
31
31
|
class Hll8Array final : public HllArray<A> {
|
|
32
32
|
public:
|
|
33
33
|
Hll8Array(uint8_t lgConfigK, bool startFullSize, const A& allocator);
|
|
34
|
+
explicit Hll8Array(const HllArray<A>& that);
|
|
34
35
|
|
|
35
36
|
virtual ~Hll8Array() = default;
|
|
36
37
|
virtual std::function<void(HllSketchImpl<A>*)> get_deleter() const;
|
|
@@ -48,6 +49,7 @@ class Hll8Array final : public HllArray<A> {
|
|
|
48
49
|
|
|
49
50
|
private:
|
|
50
51
|
inline void internalCouponUpdate(uint32_t coupon);
|
|
52
|
+
inline void processValue(uint32_t slot, uint32_t mask, uint8_t new_val);
|
|
51
53
|
};
|
|
52
54
|
|
|
53
55
|
}
|