datasketches 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/LICENSE +40 -3
- data/NOTICE +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +2 -0
- data/vendor/datasketches-cpp/LICENSE +40 -3
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +2 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +7 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +65 -1
- data/vendor/datasketches-cpp/pyproject.toml +4 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
- data/vendor/datasketches-cpp/python/README.md +50 -50
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
- data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/setup.py +10 -7
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ea00e444de6dc1bebc2b8cf878a250f08717d55eaa55f63f6bec28f4be2af00d
|
4
|
+
data.tar.gz: 161b9089e3b8d0dbd99cfb6cc0af463934c42ba85f4788a08306369966f28571
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: '09eede1e6e4c0fe57c0116c4e8873670192fea845783687ca34890bd9358af9dd19a535774ab7dd667055cf6acd0d3913f044dcf2274e0ec092b33307250a74a'
|
7
|
+
data.tar.gz: b8bcaeb7af0d27e836f21941663229a2750922914c4f31f4ffbd6e3c3876320f9ce92916eb9730e02227b87e8f244bc08e5bc38541bf4ef4e3485203fff01942
|
data/CHANGELOG.md
CHANGED
data/LICENSE
CHANGED
@@ -284,11 +284,48 @@ APPENDIX B: Additional licenses relevant to this product.
|
|
284
284
|
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
285
285
|
DEALINGS IN THE SOFTWARE.
|
286
286
|
-------------------------------------------------------------
|
287
|
-
Code Locations
|
287
|
+
Code Locations:
|
288
288
|
* https://github.com/apache/datasketches-cpp/blob/master/common/test/catch.hpp
|
289
289
|
that is adapted from the above.
|
290
290
|
|
291
291
|
|
292
|
+
=============================================================
|
293
|
+
BSD License
|
294
|
+
=============================================================
|
295
|
+
Original source code:
|
296
|
+
https://github.com/pybind/pybind11/blob/master/LICENSE
|
297
|
+
|
298
|
+
Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
|
299
|
+
|
300
|
+
Redistribution and use in source and binary forms, with or without
|
301
|
+
modification, are permitted provided that the following conditions are met:
|
302
|
+
|
303
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
304
|
+
list of conditions and the following disclaimer.
|
305
|
+
|
306
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
307
|
+
this list of conditions and the following disclaimer in the documentation
|
308
|
+
and/or other materials provided with the distribution.
|
309
|
+
|
310
|
+
3. Neither the name of the copyright holder nor the names of its contributors
|
311
|
+
may be used to endorse or promote products derived from this software
|
312
|
+
without specific prior written permission.
|
313
|
+
|
314
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
315
|
+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
316
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
317
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
318
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
319
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
320
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
321
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
322
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
323
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
324
|
+
-------------------------------------------------------------
|
325
|
+
Code Locations:
|
326
|
+
Found only in the convenience binaries distributed from PyPI, which rely
|
327
|
+
on pybind11 code during compilation.
|
328
|
+
|
292
329
|
|
293
330
|
=============================================================
|
294
331
|
Public Domain
|
@@ -297,7 +334,7 @@ APPENDIX B: Additional licenses relevant to this product.
|
|
297
334
|
https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
|
298
335
|
Placed in the Public Domain by Austin Appleby
|
299
336
|
|
300
|
-
Code Locations
|
337
|
+
Code Locations:
|
301
338
|
common/include/MurmurHash3.h
|
302
339
|
that is adapted from the above.
|
303
340
|
-------------------------------------------------------------
|
@@ -305,6 +342,6 @@ APPENDIX B: Additional licenses relevant to this product.
|
|
305
342
|
* https://graphics.stanford.edu/~seander/bithacks.html
|
306
343
|
* Placed in the Public Domain by Sean Eron Anderson
|
307
344
|
|
308
|
-
Code Locations
|
345
|
+
Code Locations:
|
309
346
|
* common/include/ceiling_power_of_2.hpp
|
310
347
|
that is adapted from the above.
|
data/NOTICE
CHANGED
data/lib/datasketches/version.rb
CHANGED
@@ -35,6 +35,8 @@ set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
|
|
35
35
|
#set(CMAKE_VERBOSE_MAKEFILE ON)
|
36
36
|
set(CMAKE_MACOSX_RPATH ON)
|
37
37
|
|
38
|
+
set(CMAKE_CXX_STANDARD 11)
|
39
|
+
|
38
40
|
# enable compiler warnings globally
|
39
41
|
# derived from https://foonathan.net/blog/2018/10/17/cmake-warnings.html
|
40
42
|
# and https://arne-mertz.de/2018/07/cmake-properties-options/
|
@@ -284,11 +284,48 @@ APPENDIX B: Additional licenses relevant to this product.
|
|
284
284
|
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
285
285
|
DEALINGS IN THE SOFTWARE.
|
286
286
|
-------------------------------------------------------------
|
287
|
-
Code Locations
|
287
|
+
Code Locations:
|
288
288
|
* https://github.com/apache/datasketches-cpp/blob/master/common/test/catch.hpp
|
289
289
|
that is adapted from the above.
|
290
290
|
|
291
291
|
|
292
|
+
=============================================================
|
293
|
+
BSD License
|
294
|
+
=============================================================
|
295
|
+
Original source code:
|
296
|
+
https://github.com/pybind/pybind11/blob/master/LICENSE
|
297
|
+
|
298
|
+
Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
|
299
|
+
|
300
|
+
Redistribution and use in source and binary forms, with or without
|
301
|
+
modification, are permitted provided that the following conditions are met:
|
302
|
+
|
303
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
304
|
+
list of conditions and the following disclaimer.
|
305
|
+
|
306
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
307
|
+
this list of conditions and the following disclaimer in the documentation
|
308
|
+
and/or other materials provided with the distribution.
|
309
|
+
|
310
|
+
3. Neither the name of the copyright holder nor the names of its contributors
|
311
|
+
may be used to endorse or promote products derived from this software
|
312
|
+
without specific prior written permission.
|
313
|
+
|
314
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
315
|
+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
316
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
317
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
318
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
319
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
320
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
321
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
322
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
323
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
324
|
+
-------------------------------------------------------------
|
325
|
+
Code Locations:
|
326
|
+
Found only in the convenience binaries distributed from PyPI, which rely
|
327
|
+
on pybind11 code during compilation.
|
328
|
+
|
292
329
|
|
293
330
|
=============================================================
|
294
331
|
Public Domain
|
@@ -297,7 +334,7 @@ APPENDIX B: Additional licenses relevant to this product.
|
|
297
334
|
https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
|
298
335
|
Placed in the Public Domain by Austin Appleby
|
299
336
|
|
300
|
-
Code Locations
|
337
|
+
Code Locations:
|
301
338
|
common/include/MurmurHash3.h
|
302
339
|
that is adapted from the above.
|
303
340
|
-------------------------------------------------------------
|
@@ -305,7 +342,7 @@ APPENDIX B: Additional licenses relevant to this product.
|
|
305
342
|
* https://graphics.stanford.edu/~seander/bithacks.html
|
306
343
|
* Placed in the Public Domain by Sean Eron Anderson
|
307
344
|
|
308
|
-
Code Locations
|
345
|
+
Code Locations:
|
309
346
|
* common/include/ceiling_power_of_2.hpp
|
310
347
|
that is adapted from the above.
|
311
348
|
|
@@ -29,6 +29,8 @@ namespace datasketches {
|
|
29
29
|
|
30
30
|
static const uint64_t DEFAULT_SEED = 9001;
|
31
31
|
|
32
|
+
enum resize_factor { X1 = 0, X2, X4, X8 };
|
33
|
+
|
32
34
|
template<typename A> using AllocChar = typename std::allocator_traits<A>::template rebind_alloc<char>;
|
33
35
|
template<typename A> using string = std::basic_string<char, std::char_traits<char>, AllocChar<A>>;
|
34
36
|
|
@@ -26,9 +26,16 @@
|
|
26
26
|
|
27
27
|
namespace datasketches {
|
28
28
|
|
29
|
-
|
30
|
-
|
31
|
-
|
29
|
+
namespace cpc_constants {
|
30
|
+
const uint8_t MIN_LG_K = 4;
|
31
|
+
const uint8_t MAX_LG_K = 26;
|
32
|
+
const uint8_t DEFAULT_LG_K = 11;
|
33
|
+
}
|
34
|
+
|
35
|
+
// TODO: Redundant and deprecated. Will be removed in next major version release.
|
36
|
+
static const uint8_t CPC_MIN_LG_K = cpc_constants::MIN_LG_K;
|
37
|
+
static const uint8_t CPC_MAX_LG_K = cpc_constants::MAX_LG_K;
|
38
|
+
static const uint8_t CPC_DEFAULT_LG_K = cpc_constants::DEFAULT_LG_K;
|
32
39
|
|
33
40
|
template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
|
34
41
|
template<typename A> using AllocU16 = typename std::allocator_traits<A>::template rebind_alloc<uint16_t>;
|
@@ -67,7 +67,7 @@ public:
|
|
67
67
|
* @param lg_k base 2 logarithm of the number of bins in the sketch
|
68
68
|
* @param seed for hash function
|
69
69
|
*/
|
70
|
-
explicit cpc_sketch_alloc(uint8_t lg_k =
|
70
|
+
explicit cpc_sketch_alloc(uint8_t lg_k = cpc_constants::DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
|
71
71
|
|
72
72
|
using allocator_type = A;
|
73
73
|
A get_allocator() const;
|
@@ -45,7 +45,7 @@ public:
|
|
45
45
|
* @param lg_k base 2 logarithm of the number of bins in the sketch
|
46
46
|
* @param seed for hash function
|
47
47
|
*/
|
48
|
-
explicit cpc_union_alloc(uint8_t lg_k =
|
48
|
+
explicit cpc_union_alloc(uint8_t lg_k = cpc_constants::DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
|
49
49
|
|
50
50
|
cpc_union_alloc(const cpc_union_alloc<A>& other);
|
51
51
|
cpc_union_alloc(cpc_union_alloc<A>&& other) noexcept;
|
@@ -350,7 +350,7 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
|
|
350
350
|
check_serial_version(serial_version);
|
351
351
|
check_family_id(family_id);
|
352
352
|
check_size(lg_cur_size, lg_max_size);
|
353
|
-
ensure_minimum_memory(size,
|
353
|
+
ensure_minimum_memory(size, preamble_longs * sizeof(uint64_t));
|
354
354
|
|
355
355
|
frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size, allocator);
|
356
356
|
if (!is_empty) {
|
@@ -153,6 +153,10 @@ template<typename A> using vector_u32 = std::vector<uint32_t, AllocU32<A>>;
|
|
153
153
|
template<typename A> using AllocD = typename std::allocator_traits<A>::template rebind_alloc<double>;
|
154
154
|
template<typename A> using vector_d = std::vector<double, AllocD<A>>;
|
155
155
|
|
156
|
+
namespace kll_constants {
|
157
|
+
const uint16_t DEFAULT_K = 200;
|
158
|
+
}
|
159
|
+
|
156
160
|
template <typename T, typename C = std::less<T>, typename S = serde<T>, typename A = std::allocator<T>>
|
157
161
|
class kll_sketch {
|
158
162
|
public:
|
@@ -160,11 +164,12 @@ class kll_sketch {
|
|
160
164
|
using comparator = C;
|
161
165
|
|
162
166
|
static const uint8_t DEFAULT_M = 8;
|
163
|
-
|
167
|
+
// TODO: Redundant and deprecated. Will be removed in next major version.
|
168
|
+
static const uint16_t DEFAULT_K = kll_constants::DEFAULT_K;
|
164
169
|
static const uint16_t MIN_K = DEFAULT_M;
|
165
170
|
static const uint16_t MAX_K = (1 << 16) - 1;
|
166
171
|
|
167
|
-
explicit kll_sketch(uint16_t k = DEFAULT_K, const A& allocator = A());
|
172
|
+
explicit kll_sketch(uint16_t k = kll_constants::DEFAULT_K, const A& allocator = A());
|
168
173
|
kll_sketch(const kll_sketch& other);
|
169
174
|
kll_sketch(kll_sketch&& other) noexcept;
|
170
175
|
~kll_sketch();
|
@@ -575,7 +575,7 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, si
|
|
575
575
|
check_preamble_ints(preamble_ints, flags_byte);
|
576
576
|
check_serial_version(serial_version);
|
577
577
|
check_family_id(family_id);
|
578
|
-
ensure_minimum_memory(size,
|
578
|
+
ensure_minimum_memory(size, preamble_ints * sizeof(uint32_t));
|
579
579
|
|
580
580
|
const bool is_empty(flags_byte & (1 << flags::IS_EMPTY));
|
581
581
|
if (is_empty) return kll_sketch<T, C, S, A>(k, allocator);
|
@@ -279,6 +279,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
279
279
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
280
280
|
auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
|
281
281
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
282
|
+
REQUIRE(s.tellg() == s.tellp());
|
282
283
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
283
284
|
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
|
284
285
|
REQUIRE(sketch2.get_n() == sketch.get_n());
|
@@ -304,7 +305,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
304
305
|
REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
|
305
306
|
}
|
306
307
|
|
307
|
-
SECTION("serialize deserialize one item") {
|
308
|
+
SECTION("stream serialize deserialize one item") {
|
308
309
|
kll_float_sketch sketch(200, 0);
|
309
310
|
sketch.update(1.0f);
|
310
311
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
@@ -324,6 +325,24 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
324
325
|
REQUIRE(sketch2.get_rank(2) == 1.0);
|
325
326
|
}
|
326
327
|
|
328
|
+
SECTION("bytes serialize deserialize one item") {
|
329
|
+
kll_float_sketch sketch(200, 0);
|
330
|
+
sketch.update(1.0f);
|
331
|
+
auto bytes = sketch.serialize();
|
332
|
+
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
333
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
334
|
+
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
335
|
+
REQUIRE_FALSE(sketch2.is_empty());
|
336
|
+
REQUIRE_FALSE(sketch2.is_estimation_mode());
|
337
|
+
REQUIRE(sketch2.get_n() == 1);
|
338
|
+
REQUIRE(sketch2.get_num_retained() == 1);
|
339
|
+
REQUIRE(sketch2.get_min_value() == 1.0);
|
340
|
+
REQUIRE(sketch2.get_max_value() == 1.0);
|
341
|
+
REQUIRE(sketch2.get_quantile(0.5) == 1.0);
|
342
|
+
REQUIRE(sketch2.get_rank(1) == 0.0);
|
343
|
+
REQUIRE(sketch2.get_rank(2) == 1.0);
|
344
|
+
}
|
345
|
+
|
327
346
|
SECTION("deserialize one item v1") {
|
328
347
|
std::ifstream is;
|
329
348
|
is.exceptions(std::ios::failbit | std::ios::badbit);
|
@@ -337,6 +356,42 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
337
356
|
REQUIRE(sketch.get_max_value() == 1.0);
|
338
357
|
}
|
339
358
|
|
359
|
+
SECTION("stream serialize deserialize three items") {
|
360
|
+
kll_float_sketch sketch(200, 0);
|
361
|
+
sketch.update(1.0f);
|
362
|
+
sketch.update(2.0f);
|
363
|
+
sketch.update(3.0f);
|
364
|
+
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
365
|
+
sketch.serialize(s);
|
366
|
+
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
367
|
+
auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
|
368
|
+
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
369
|
+
REQUIRE(s.tellg() == s.tellp());
|
370
|
+
REQUIRE_FALSE(sketch2.is_empty());
|
371
|
+
REQUIRE_FALSE(sketch2.is_estimation_mode());
|
372
|
+
REQUIRE(sketch2.get_n() == 3);
|
373
|
+
REQUIRE(sketch2.get_num_retained() == 3);
|
374
|
+
REQUIRE(sketch2.get_min_value() == 1.0);
|
375
|
+
REQUIRE(sketch2.get_max_value() == 3.0);
|
376
|
+
}
|
377
|
+
|
378
|
+
SECTION("bytes serialize deserialize three items") {
|
379
|
+
kll_float_sketch sketch(200, 0);
|
380
|
+
sketch.update(1.0f);
|
381
|
+
sketch.update(2.0f);
|
382
|
+
sketch.update(3.0f);
|
383
|
+
auto bytes = sketch.serialize();
|
384
|
+
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
385
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
386
|
+
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
387
|
+
REQUIRE_FALSE(sketch2.is_empty());
|
388
|
+
REQUIRE_FALSE(sketch2.is_estimation_mode());
|
389
|
+
REQUIRE(sketch2.get_n() == 3);
|
390
|
+
REQUIRE(sketch2.get_num_retained() == 3);
|
391
|
+
REQUIRE(sketch2.get_min_value() == 1.0);
|
392
|
+
REQUIRE(sketch2.get_max_value() == 3.0);
|
393
|
+
}
|
394
|
+
|
340
395
|
SECTION("stream serialize deserialize many floats") {
|
341
396
|
kll_float_sketch sketch(200, 0);
|
342
397
|
const int n = 1000;
|
@@ -702,6 +757,15 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
702
757
|
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000000000, 4) == 3160);
|
703
758
|
}
|
704
759
|
|
760
|
+
SECTION("issue #236") {
|
761
|
+
kll_sketch<int8_t> kll;
|
762
|
+
kll.update(1);
|
763
|
+
kll.update(2);
|
764
|
+
kll.update(3);
|
765
|
+
auto blob = kll.serialize();
|
766
|
+
auto kll2 = kll_sketch<int8_t>::deserialize(blob.data(), blob.size());
|
767
|
+
}
|
768
|
+
|
705
769
|
// cleanup
|
706
770
|
if (test_allocator_total_bytes != 0) {
|
707
771
|
REQUIRE(test_allocator_total_bytes == 0);
|
@@ -15,16 +15,20 @@
|
|
15
15
|
# specific language governing permissions and limitations
|
16
16
|
# under the License.
|
17
17
|
|
18
|
-
|
18
|
+
find_package(Python3 COMPONENTS Interpreter Development)
|
19
|
+
|
20
|
+
# only Windows+MSVC seems to have trouble locating pybind11
|
19
21
|
if (MSVC)
|
20
|
-
|
21
|
-
|
22
|
-
|
22
|
+
execute_process(COMMAND cmd.exe /c ${CMAKE_CURRENT_SOURCE_DIR}/pybind11Path.cmd "${Python3_EXECUTABLE}"
|
23
|
+
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
24
|
+
OUTPUT_STRIP_TRAILING_WHITESPACE
|
25
|
+
OUTPUT_VARIABLE EXTRA_PACKAGE_PATH)
|
26
|
+
set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} ${EXTRA_PACKAGE_PATH})
|
23
27
|
endif()
|
24
28
|
|
25
|
-
|
29
|
+
find_package(pybind11 CONFIG REQUIRED)
|
26
30
|
|
27
|
-
pybind11_add_module(python MODULE EXCLUDE_FROM_ALL
|
31
|
+
pybind11_add_module(python MODULE EXCLUDE_FROM_ALL THIN_LTO)
|
28
32
|
|
29
33
|
target_link_libraries(python
|
30
34
|
PRIVATE
|
@@ -1,76 +1,57 @@
|
|
1
|
-
|
1
|
+
<img src="https://raw.githubusercontent.com/apache/datasketches-website/master/logos/svg/datasketches-HorizontalColor-TM.svg" width="75%" alt="Apache DataSketches Logo">
|
2
2
|
|
3
|
-
|
3
|
+
# The Apache DataSketches Library for Python
|
4
4
|
|
5
|
-
|
6
|
-
from a release package, you must ensure that the pybind11 directory points to a local copy of pybind11.
|
5
|
+
This is the official version of the [Apache DataSketches](https://datasketches.apache.org) Python library.
|
7
6
|
|
8
|
-
|
7
|
+
In the analysis of big data there are often problem queries that don’t scale because they require huge compute resources and time to generate exact results. Examples include count distinct, quantiles, most-frequent items, joins, matrix computations, and graph analysis.
|
9
8
|
|
10
|
-
If
|
11
|
-
```pip install git+https://github.com/apache/datasketches-cpp.git```
|
9
|
+
If approximate results are acceptable, there is a class of specialized algorithms, called streaming algorithms, or sketches, that can produce results orders of magnitude faster and with mathematically proven error bounds. For interactive queries there may not be other viable alternatives, and in the case of real-time analysis, sketches are the only known solution.
|
12
10
|
|
13
|
-
|
14
|
-
|
15
|
-
### Building
|
16
|
-
|
17
|
-
When cloning the source repository, you should include the pybind11 submodule with the `--recursive` option to the clone command:
|
18
|
-
```
|
19
|
-
git clone --recursive https://github.com/apache/datasketches-cpp.git
|
20
|
-
cd datasketches-cpp
|
21
|
-
python -m pip install --upgrade pip setuptools wheel numpy
|
22
|
-
python setup.py build
|
23
|
-
```
|
11
|
+
This package provides a variety of sketches as described below. Wherever a specific type of sketch exists in Apache DataSketches packages for other languages, the sketches will be portable between languages (for platforms with the same endianness).
|
24
12
|
|
25
|
-
|
13
|
+
## Building and Installation
|
26
14
|
|
27
|
-
|
15
|
+
Once cloned, the library can be installed by running `python -m pip install .` in the project root directory, which will also install the necessary dependencies, namely numpy and [pybind11[global]](https://github.com/pybind/pybind11).
|
28
16
|
|
29
|
-
|
30
|
-
line of the build command with `python setup.py install`.
|
17
|
+
If you prefer to call the `setup.py` build script directly, you must first install `pybind11[global]`, as well as any other dependencies listed under the build-system section in `pyproject.toml`.
|
31
18
|
|
32
|
-
|
33
|
-
|
34
|
-
The python tests are run with `tox`. To ensure you have all the needed packages, from the package base directory run:
|
35
|
-
```
|
36
|
-
python -m pip install --upgrade pip setuptools wheel numpy tox
|
37
|
-
tox
|
38
|
-
```
|
19
|
+
The library is also available from PyPI via `python -m pip install datasketches`.
|
39
20
|
|
40
21
|
## Usage
|
41
22
|
|
42
|
-
Having installed the library, loading the Apache Datasketches
|
23
|
+
Having installed the library, loading the Apache DataSketches Library in Python is simple: `import datasketches`.
|
43
24
|
|
44
25
|
## Available Sketch Classes
|
45
26
|
|
46
27
|
- KLL (Absolute Error Quantiles)
|
47
|
-
|
48
|
-
|
28
|
+
- `kll_ints_sketch`
|
29
|
+
- `kll_floats_sketch`
|
49
30
|
- REQ (Relative Error Quantiles)
|
50
|
-
|
51
|
-
|
31
|
+
- `req_ints_sketch`
|
32
|
+
- `req_floats_sketch`
|
52
33
|
- Frequent Items
|
53
|
-
|
54
|
-
|
34
|
+
- `frequent_strings_sketch`
|
35
|
+
- Error types are `frequent_items_error_type.{NO_FALSE_NEGATIVES | NO_FALSE_POSITIVES}`
|
55
36
|
- Theta
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
37
|
+
- `update_theta_sketch`
|
38
|
+
- `compact_theta_sketch` (cannot be instantiated directly)
|
39
|
+
- `theta_union`
|
40
|
+
- `theta_intersection`
|
41
|
+
- `theta_a_not_b`
|
61
42
|
- HLL
|
62
|
-
|
63
|
-
|
64
|
-
|
43
|
+
- `hll_sketch`
|
44
|
+
- `hll_union`
|
45
|
+
- Target HLL types are `tgt_hll_type.{HLL_4 | HLL_6 | HLL_8}`
|
65
46
|
- CPC
|
66
|
-
|
67
|
-
|
47
|
+
- `cpc_sketch`
|
48
|
+
- `cpc_union`
|
68
49
|
- VarOpt Sampling
|
69
|
-
|
70
|
-
|
50
|
+
- `var_opt_sketch`
|
51
|
+
- `var_opt_union`
|
71
52
|
- Vector of KLL
|
72
|
-
|
73
|
-
|
53
|
+
- `vector_of_kll_ints_sketches`
|
54
|
+
- `vector_of_kll_floats_sketches`
|
74
55
|
|
75
56
|
## Known Differences from C++
|
76
57
|
|
@@ -79,3 +60,22 @@ The Python API largely mirrors the C++ API, with a few minor exceptions: The pri
|
|
79
60
|
The Vector of KLL object is currently exclusive to python, and holds an array of independent KLL sketches. This is useful for creating a set of KLL sketches over a vector and has been designed to allow input as either a vector or a matrix of multiple vectors.
|
80
61
|
|
81
62
|
We have also removed reliance on a builder class for theta sketches as Python allows named arguments to the constructor, not strictly positional arguments.
|
63
|
+
|
64
|
+
## Developer Instructions
|
65
|
+
|
66
|
+
The only developer-specific instructions relate to running unit tests.
|
67
|
+
|
68
|
+
### Unit tests
|
69
|
+
|
70
|
+
The Python unit tests are run with `tox`. To ensure you have all the needed packages, from the package base directory run:
|
71
|
+
|
72
|
+
```bash
|
73
|
+
python -m pip install --upgrade tox
|
74
|
+
tox
|
75
|
+
```
|
76
|
+
|
77
|
+
## License
|
78
|
+
|
79
|
+
The Apache DataSketches Library is distributed under an Apache 2.0 License.
|
80
|
+
|
81
|
+
Precompiled binaries, which may be provided as a convenience and distributed through PyPI via <https://pypi.org/project/datasketches/>, contain compiled code from [pybind11](https://github.com/pybind/pybind11), which is distributed under a BSD license.
|
@@ -53,7 +53,7 @@ void init_cpc(py::module &m) {
|
|
53
53
|
using namespace datasketches;
|
54
54
|
|
55
55
|
py::class_<cpc_sketch>(m, "cpc_sketch")
|
56
|
-
.def(py::init<uint8_t, uint64_t>(), py::arg("lg_k")=
|
56
|
+
.def(py::init<uint8_t, uint64_t>(), py::arg("lg_k")=cpc_constants::DEFAULT_LG_K, py::arg("seed")=DEFAULT_SEED)
|
57
57
|
.def(py::init<const cpc_sketch&>())
|
58
58
|
.def("__str__", &cpc_sketch::to_string,
|
59
59
|
"Produces a string summary of the sketch")
|
@@ -116,7 +116,7 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
116
116
|
using namespace datasketches;
|
117
117
|
|
118
118
|
py::class_<kll_sketch<T>>(m, name)
|
119
|
-
.def(py::init<uint16_t>(), py::arg("k")=
|
119
|
+
.def(py::init<uint16_t>(), py::arg("k")=kll_constants::DEFAULT_K)
|
120
120
|
.def(py::init<const kll_sketch<T>&>())
|
121
121
|
.def("update", (void (kll_sketch<T>::*)(const T&)) &kll_sketch<T>::update, py::arg("item"),
|
122
122
|
"Updates the sketch with the given value")
|
@@ -103,7 +103,7 @@ void init_theta(py::module &m) {
|
|
103
103
|
|
104
104
|
py::class_<update_theta_sketch, theta_sketch>(m, "update_theta_sketch")
|
105
105
|
.def(py::init(&dspy::update_theta_sketch_factory),
|
106
|
-
py::arg("lg_k")=
|
106
|
+
py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED)
|
107
107
|
.def(py::init<const update_theta_sketch&>())
|
108
108
|
.def("update", (void (update_theta_sketch::*)(int64_t)) &update_theta_sketch::update, py::arg("datum"),
|
109
109
|
"Updates the sketch with the given integral value")
|
@@ -127,7 +127,7 @@ void init_theta(py::module &m) {
|
|
127
127
|
|
128
128
|
py::class_<theta_union>(m, "theta_union")
|
129
129
|
.def(py::init(&dspy::theta_union_factory),
|
130
|
-
py::arg("lg_k")=
|
130
|
+
py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED)
|
131
131
|
.def("update", &theta_union::update<const theta_sketch&>, py::arg("sketch"),
|
132
132
|
"Updates the union with the given sketch")
|
133
133
|
.def("get_result", &theta_union::get_result, py::arg("ordered")=true,
|
@@ -29,14 +29,20 @@ namespace py = pybind11;
|
|
29
29
|
|
30
30
|
namespace datasketches {
|
31
31
|
|
32
|
+
namespace vector_of_kll_constants {
|
33
|
+
static const uint32_t DEFAULT_K = kll_constants::DEFAULT_K;
|
34
|
+
static const uint32_t DEFAULT_D = 1;
|
35
|
+
}
|
36
|
+
|
32
37
|
// Wrapper class for Numpy compatibility
|
33
38
|
template <typename T, typename C = std::less<T>, typename S = serde<T>>
|
34
39
|
class vector_of_kll_sketches {
|
35
40
|
public:
|
36
|
-
|
37
|
-
static const uint32_t
|
41
|
+
// TODO: Redundant and deprecated. Will be removed in next major version release.
|
42
|
+
static const uint32_t DEFAULT_K = vector_of_kll_constants::DEFAULT_K;
|
43
|
+
static const uint32_t DEFAULT_D = vector_of_kll_constants::DEFAULT_D;
|
38
44
|
|
39
|
-
explicit vector_of_kll_sketches(uint32_t k = DEFAULT_K, uint32_t d = DEFAULT_D);
|
45
|
+
explicit vector_of_kll_sketches(uint32_t k = vector_of_kll_constants::DEFAULT_K, uint32_t d = vector_of_kll_constants::DEFAULT_D);
|
40
46
|
vector_of_kll_sketches(const vector_of_kll_sketches& other);
|
41
47
|
vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept;
|
42
48
|
vector_of_kll_sketches<T,C,S>& operator=(const vector_of_kll_sketches& other);
|
@@ -432,8 +438,8 @@ void bind_vector_of_kll_sketches(py::module &m, const char* name) {
|
|
432
438
|
using namespace datasketches;
|
433
439
|
|
434
440
|
py::class_<vector_of_kll_sketches<T>>(m, name)
|
435
|
-
.def(py::init<uint32_t, uint32_t>(), py::arg("k")=
|
436
|
-
py::arg("d")=
|
441
|
+
.def(py::init<uint32_t, uint32_t>(), py::arg("k")=vector_of_kll_constants::DEFAULT_K,
|
442
|
+
py::arg("d")=vector_of_kll_constants::DEFAULT_D)
|
437
443
|
.def(py::init<const vector_of_kll_sketches<T>&>())
|
438
444
|
// allow user to retrieve k or d, in case it's instantiated w/ defaults
|
439
445
|
.def("get_k", &vector_of_kll_sketches<T>::get_k,
|
@@ -30,10 +30,10 @@ class KllTest(unittest.TestCase):
|
|
30
30
|
kll.update(0.0)
|
31
31
|
|
32
32
|
# 0 should be near the median
|
33
|
-
self.assertAlmostEqual(0.5, kll.get_rank(0.0), delta=0.
|
33
|
+
self.assertAlmostEqual(0.5, kll.get_rank(0.0), delta=0.035)
|
34
34
|
|
35
35
|
# the median should be near 0
|
36
|
-
self.assertAlmostEqual(0.0, kll.get_quantile(0.5), delta=0.
|
36
|
+
self.assertAlmostEqual(0.0, kll.get_quantile(0.5), delta=0.035)
|
37
37
|
|
38
38
|
# we also track the min/max independently from the rest of the data
|
39
39
|
# which lets us know the full observed data range
|
@@ -30,10 +30,10 @@ class reqTest(unittest.TestCase):
|
|
30
30
|
req.update(0.0)
|
31
31
|
|
32
32
|
# 0 should be near the median
|
33
|
-
self.assertAlmostEqual(0.5, req.get_rank(0.0), delta=0.
|
33
|
+
self.assertAlmostEqual(0.5, req.get_rank(0.0), delta=0.045)
|
34
34
|
|
35
35
|
# the median should be near 0
|
36
|
-
self.assertAlmostEqual(0.0, req.get_quantile(0.5), delta=0.
|
36
|
+
self.assertAlmostEqual(0.0, req.get_quantile(0.5), delta=0.045)
|
37
37
|
|
38
38
|
# we also track the min/max independently from the rest of the data
|
39
39
|
# which lets us know the full observed data range
|
@@ -39,9 +39,9 @@ class VectorOfKllSketchesTest(unittest.TestCase):
|
|
39
39
|
kll.update(dat)
|
40
40
|
|
41
41
|
# 0 should be near the median
|
42
|
-
np.testing.assert_allclose(0.5, kll.get_ranks(0.0), atol=0.
|
42
|
+
np.testing.assert_allclose(0.5, kll.get_ranks(0.0), atol=0.035)
|
43
43
|
# the median should be near 0
|
44
|
-
np.testing.assert_allclose(0.0, kll.get_quantiles(0.5), atol=0.
|
44
|
+
np.testing.assert_allclose(0.0, kll.get_quantiles(0.5), atol=0.035)
|
45
45
|
# we also track the min/max independently from the rest of the data
|
46
46
|
# which lets us know the full observed data range
|
47
47
|
np.testing.assert_allclose(kll.get_min_values(), smin)
|
@@ -118,9 +118,9 @@ class VectorOfKllSketchesTest(unittest.TestCase):
|
|
118
118
|
kll.update(dat)
|
119
119
|
|
120
120
|
# 0 should be near the median
|
121
|
-
np.testing.assert_allclose(0.5, kll.get_ranks(0.0), atol=0.
|
121
|
+
np.testing.assert_allclose(0.5, kll.get_ranks(0.0), atol=0.035)
|
122
122
|
# the median should be near 0
|
123
|
-
np.testing.assert_allclose(0.0, kll.get_quantiles(0.5), atol=0.
|
123
|
+
np.testing.assert_allclose(0.0, kll.get_quantiles(0.5), atol=0.035)
|
124
124
|
# we also track the min/max independently from the rest of the data
|
125
125
|
# which lets us know the full observed data range
|
126
126
|
np.testing.assert_allclose(kll.get_min_values(), smin)
|
@@ -51,18 +51,23 @@ struct subset_summary {
|
|
51
51
|
double total_sketch_weight;
|
52
52
|
};
|
53
53
|
|
54
|
-
enum resize_factor { X1 = 0, X2, X4, X8 };
|
55
|
-
|
56
54
|
template <typename T, typename S, typename A> class var_opt_union; // forward declaration
|
57
55
|
|
56
|
+
namespace var_opt_constants {
|
57
|
+
const resize_factor DEFAULT_RESIZE_FACTOR = resize_factor::X8;
|
58
|
+
const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
|
59
|
+
}
|
60
|
+
|
58
61
|
template <typename T, typename S = serde<T>, typename A = std::allocator<T>>
|
59
62
|
class var_opt_sketch {
|
60
63
|
|
61
64
|
public:
|
62
|
-
static const resize_factor DEFAULT_RESIZE_FACTOR =
|
63
|
-
static const uint32_t MAX_K =
|
65
|
+
static const resize_factor DEFAULT_RESIZE_FACTOR = var_opt_constants::DEFAULT_RESIZE_FACTOR;
|
66
|
+
static const uint32_t MAX_K = var_opt_constants::MAX_K;
|
64
67
|
|
65
|
-
explicit var_opt_sketch(uint32_t k,
|
68
|
+
explicit var_opt_sketch(uint32_t k,
|
69
|
+
resize_factor rf = var_opt_constants::DEFAULT_RESIZE_FACTOR,
|
70
|
+
const A& allocator = A());
|
66
71
|
var_opt_sketch(const var_opt_sketch& other);
|
67
72
|
var_opt_sketch(var_opt_sketch&& other) noexcept;
|
68
73
|
|
@@ -128,7 +128,7 @@ var_opt_sketch<T,S,A>::var_opt_sketch(T* data, double* weights, size_t len,
|
|
128
128
|
r_(r_count),
|
129
129
|
n_(n),
|
130
130
|
total_wt_r_(total_wt_r),
|
131
|
-
rf_(DEFAULT_RESIZE_FACTOR),
|
131
|
+
rf_(var_opt_constants::DEFAULT_RESIZE_FACTOR),
|
132
132
|
curr_items_alloc_(len),
|
133
133
|
filled_data_(n > k),
|
134
134
|
allocator_(allocator),
|
@@ -49,8 +49,9 @@ class CMakeBuild(build_ext):
|
|
49
49
|
os.path.dirname(self.get_ext_fullpath(ext.name)))
|
50
50
|
cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir]
|
51
51
|
cmake_args += ['-DWITH_PYTHON=True']
|
52
|
+
cmake_args += ['-DCMAKE_CXX_STANDARD=11']
|
52
53
|
# ensure we use a consistent python version
|
53
|
-
cmake_args += ['-
|
54
|
+
cmake_args += ['-DPython3_EXECUTABLE=' + sys.executable]
|
54
55
|
cfg = 'Debug' if self.debug else 'Release'
|
55
56
|
build_args = ['--config', cfg]
|
56
57
|
|
@@ -59,7 +60,8 @@ class CMakeBuild(build_ext):
|
|
59
60
|
cfg.upper(),
|
60
61
|
extdir)]
|
61
62
|
if sys.maxsize > 2**32:
|
62
|
-
cmake_args += ['-
|
63
|
+
cmake_args += ['-T', 'host=x64']
|
64
|
+
cmake_args += ['-DCMAKE_GENERATOR_PLATFORM=x64']
|
63
65
|
build_args += ['--', '/m']
|
64
66
|
else:
|
65
67
|
cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
|
@@ -74,23 +76,24 @@ class CMakeBuild(build_ext):
|
|
74
76
|
subprocess.check_call(['cmake', ext.sourcedir] + cmake_args,
|
75
77
|
cwd=self.build_temp, env=env)
|
76
78
|
subprocess.check_call(['cmake', '--build', '.', '--target', 'python'] + build_args,
|
77
|
-
cwd=self.build_temp)
|
79
|
+
cwd=self.build_temp, env=env)
|
78
80
|
print() # add an empty line to pretty print
|
79
81
|
|
80
82
|
setup(
|
81
83
|
name='datasketches',
|
82
|
-
version='3.
|
83
|
-
author='Apache
|
84
|
+
version='3.2.0.1',
|
85
|
+
author='Apache Software Foundation',
|
84
86
|
author_email='dev@datasketches.apache.org',
|
85
|
-
description='
|
87
|
+
description='The Apache DataSketches Library for Python',
|
86
88
|
license='Apache License 2.0',
|
87
89
|
url='http://datasketches.apache.org',
|
88
90
|
long_description=open('python/README.md').read(),
|
91
|
+
long_description_content_type='text/markdown',
|
89
92
|
packages=find_packages('python'), # python packages only in this dir
|
90
93
|
package_dir={'':'python'},
|
91
94
|
# may need to add all source paths for sdist packages w/o MANIFEST.in
|
92
95
|
ext_modules=[CMakeExtension('datasketches')],
|
93
96
|
cmdclass={'build_ext': CMakeBuild},
|
94
|
-
|
97
|
+
install_requires=['numpy'],
|
95
98
|
zip_safe=False
|
96
99
|
)
|
@@ -21,14 +21,19 @@
|
|
21
21
|
#define THETA_CONSTANTS_HPP_
|
22
22
|
|
23
23
|
#include <climits>
|
24
|
+
#include "common_defs.hpp"
|
24
25
|
|
25
26
|
namespace datasketches {
|
26
27
|
|
27
28
|
namespace theta_constants {
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
29
|
+
using resize_factor = datasketches::resize_factor;
|
30
|
+
//enum resize_factor { X1, X2, X4, X8 };
|
31
|
+
const uint64_t MAX_THETA = LLONG_MAX; // signed max for compatibility with Java
|
32
|
+
const uint8_t MIN_LG_K = 5;
|
33
|
+
const uint8_t MAX_LG_K = 26;
|
34
|
+
|
35
|
+
const uint8_t DEFAULT_LG_K = 12;
|
36
|
+
const resize_factor DEFAULT_RESIZE_FACTOR = resize_factor::X8;
|
32
37
|
}
|
33
38
|
|
34
39
|
} /* namespace datasketches */
|
@@ -94,11 +94,14 @@ struct theta_update_sketch_base {
|
|
94
94
|
template<typename Derived, typename Allocator>
|
95
95
|
class theta_base_builder {
|
96
96
|
public:
|
97
|
+
// TODO: Redundant and deprecated. Will be removed in next major version release.
|
97
98
|
using resize_factor = theta_constants::resize_factor;
|
98
99
|
static const uint8_t MIN_LG_K = theta_constants::MIN_LG_K;
|
99
100
|
static const uint8_t MAX_LG_K = theta_constants::MAX_LG_K;
|
100
|
-
|
101
|
-
|
101
|
+
// TODO: The following defaults are redundant and deprecated. Will be removed in the
|
102
|
+
// next major version release
|
103
|
+
static const uint8_t DEFAULT_LG_K = theta_constants::DEFAULT_LG_K;
|
104
|
+
static const resize_factor DEFAULT_RESIZE_FACTOR = theta_constants::DEFAULT_RESIZE_FACTOR;
|
102
105
|
|
103
106
|
/**
|
104
107
|
* Creates an instance of the builder with default parameters.
|
@@ -271,7 +271,11 @@ void theta_update_sketch_base<EN, EK, A>::consolidate_non_empty(EN* entries, siz
|
|
271
271
|
|
272
272
|
template<typename Derived, typename Allocator>
|
273
273
|
theta_base_builder<Derived, Allocator>::theta_base_builder(const Allocator& allocator):
|
274
|
-
allocator_(allocator),
|
274
|
+
allocator_(allocator),
|
275
|
+
lg_k_(theta_constants::DEFAULT_LG_K),
|
276
|
+
rf_(theta_constants::DEFAULT_RESIZE_FACTOR),
|
277
|
+
p_(1),
|
278
|
+
seed_(DEFAULT_SEED) {}
|
275
279
|
|
276
280
|
template<typename Derived, typename Allocator>
|
277
281
|
Derived& theta_base_builder<Derived, Allocator>::set_lg_k(uint8_t lg_k) {
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datasketches
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-09-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rice
|
@@ -177,6 +177,7 @@ files:
|
|
177
177
|
- vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb
|
178
178
|
- vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb
|
179
179
|
- vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb
|
180
|
+
- vendor/datasketches-cpp/python/pybind11Path.cmd
|
180
181
|
- vendor/datasketches-cpp/python/src/__init__.py
|
181
182
|
- vendor/datasketches-cpp/python/src/cpc_wrapper.cpp
|
182
183
|
- vendor/datasketches-cpp/python/src/datasketches.cpp
|