datasketches 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/datasketches/cpc_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
- data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
- data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
- data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
- data/vendor/datasketches-cpp/python/README.md +5 -5
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
- data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
- data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
- data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
- data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
- data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
- data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +31 -3
|
@@ -41,7 +41,7 @@ void theta_union_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
|
|
|
41
41
|
if (sketch.is_empty()) return;
|
|
42
42
|
if (sketch.get_seed_hash() != compute_seed_hash(table_.seed_)) throw std::invalid_argument("seed hash mismatch");
|
|
43
43
|
table_.is_empty_ = false;
|
|
44
|
-
|
|
44
|
+
union_theta_ = std::min(union_theta_, sketch.get_theta64());
|
|
45
45
|
for (auto& entry: sketch) {
|
|
46
46
|
const uint64_t hash = EK()(entry);
|
|
47
47
|
if (hash < union_theta_ && hash < table_.theta_) {
|
|
@@ -55,7 +55,7 @@ void theta_union_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
|
|
|
55
55
|
if (sketch.is_ordered()) break; // early stop
|
|
56
56
|
}
|
|
57
57
|
}
|
|
58
|
-
|
|
58
|
+
union_theta_ = std::min(union_theta_, table_.theta_);
|
|
59
59
|
}
|
|
60
60
|
|
|
61
61
|
template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
|
|
@@ -65,16 +65,16 @@ CS theta_union_base<EN, EK, P, S, CS, A>::get_result(bool ordered) const {
|
|
|
65
65
|
entries.reserve(table_.num_entries_);
|
|
66
66
|
uint64_t theta = std::min(union_theta_, table_.theta_);
|
|
67
67
|
const uint32_t nominal_num = 1 << table_.lg_nom_size_;
|
|
68
|
-
if (union_theta_ >=
|
|
68
|
+
if (union_theta_ >= table_.theta_) {
|
|
69
69
|
std::copy_if(table_.begin(), table_.end(), std::back_inserter(entries), key_not_zero<EN, EK>());
|
|
70
70
|
} else {
|
|
71
71
|
std::copy_if(table_.begin(), table_.end(), std::back_inserter(entries), key_not_zero_less_than<uint64_t, EN, EK>(theta));
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
72
|
+
}
|
|
73
|
+
if (entries.size() > nominal_num) {
|
|
74
|
+
std::nth_element(entries.begin(), entries.begin() + nominal_num, entries.end(), comparator());
|
|
75
|
+
theta = EK()(entries[nominal_num]);
|
|
76
|
+
entries.erase(entries.begin() + nominal_num, entries.end());
|
|
77
|
+
entries.shrink_to_fit();
|
|
78
78
|
}
|
|
79
79
|
if (ordered) std::sort(entries.begin(), entries.end(), comparator());
|
|
80
80
|
return CS(table_.is_empty_, ordered, compute_seed_hash(table_.seed_), theta, std::move(entries));
|
|
@@ -23,8 +23,8 @@
|
|
|
23
23
|
#include <vector>
|
|
24
24
|
#include <climits>
|
|
25
25
|
#include <cmath>
|
|
26
|
+
#include <iterator>
|
|
26
27
|
|
|
27
|
-
#include "common_defs.hpp"
|
|
28
28
|
#include "MurmurHash3.h"
|
|
29
29
|
#include "theta_comparators.hpp"
|
|
30
30
|
#include "theta_constants.hpp"
|
|
@@ -185,8 +185,14 @@ static inline uint64_t compute_hash(const void* data, size_t length, uint64_t se
|
|
|
185
185
|
// iterators
|
|
186
186
|
|
|
187
187
|
template<typename Entry, typename ExtractKey>
|
|
188
|
-
class theta_iterator
|
|
188
|
+
class theta_iterator {
|
|
189
189
|
public:
|
|
190
|
+
using iterator_category = std::input_iterator_tag;
|
|
191
|
+
using value_type = Entry;
|
|
192
|
+
using difference_type = std::ptrdiff_t;
|
|
193
|
+
using pointer = Entry*;
|
|
194
|
+
using reference = Entry&;
|
|
195
|
+
|
|
190
196
|
theta_iterator(Entry* entries, uint32_t size, uint32_t index);
|
|
191
197
|
theta_iterator& operator++();
|
|
192
198
|
theta_iterator operator++(int);
|
|
@@ -201,14 +207,20 @@ private:
|
|
|
201
207
|
};
|
|
202
208
|
|
|
203
209
|
template<typename Entry, typename ExtractKey>
|
|
204
|
-
class theta_const_iterator
|
|
210
|
+
class theta_const_iterator {
|
|
205
211
|
public:
|
|
212
|
+
using iterator_category = std::input_iterator_tag;
|
|
213
|
+
using value_type = const Entry;
|
|
214
|
+
using difference_type = std::ptrdiff_t;
|
|
215
|
+
using pointer = const Entry*;
|
|
216
|
+
using reference = const Entry&;
|
|
217
|
+
|
|
206
218
|
theta_const_iterator(const Entry* entries, uint32_t size, uint32_t index);
|
|
207
219
|
theta_const_iterator& operator++();
|
|
208
220
|
theta_const_iterator operator++(int);
|
|
209
221
|
bool operator==(const theta_const_iterator& other) const;
|
|
210
222
|
bool operator!=(const theta_const_iterator& other) const;
|
|
211
|
-
|
|
223
|
+
reference operator*() const;
|
|
212
224
|
|
|
213
225
|
private:
|
|
214
226
|
const Entry* entries_;
|
|
@@ -188,7 +188,7 @@ auto theta_update_sketch_base<EN, EK, A>::begin() const -> iterator {
|
|
|
188
188
|
|
|
189
189
|
template<typename EN, typename EK, typename A>
|
|
190
190
|
auto theta_update_sketch_base<EN, EK, A>::end() const -> iterator {
|
|
191
|
-
return
|
|
191
|
+
return entries_ + (1ULL << lg_cur_size_);
|
|
192
192
|
}
|
|
193
193
|
|
|
194
194
|
template<typename EN, typename EK, typename A>
|
|
@@ -382,7 +382,7 @@ bool theta_iterator<Entry, ExtractKey>::operator==(const theta_iterator& other)
|
|
|
382
382
|
}
|
|
383
383
|
|
|
384
384
|
template<typename Entry, typename ExtractKey>
|
|
385
|
-
auto theta_iterator<Entry, ExtractKey>::operator*() const ->
|
|
385
|
+
auto theta_iterator<Entry, ExtractKey>::operator*() const -> reference {
|
|
386
386
|
return entries_[index_];
|
|
387
387
|
}
|
|
388
388
|
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <catch2/catch.hpp>
|
|
21
|
+
#include <bit_packing.hpp>
|
|
22
|
+
|
|
23
|
+
namespace datasketches {
|
|
24
|
+
|
|
25
|
+
// for every number of bits from 1 to 63
|
|
26
|
+
// generate pseudo-random data, pack, unpack and compare
|
|
27
|
+
|
|
28
|
+
// inverse golden ratio (0.618.. of max uint64_t)
|
|
29
|
+
static const uint64_t IGOLDEN64 = 0x9e3779b97f4a7c13ULL;
|
|
30
|
+
|
|
31
|
+
TEST_CASE("pack unpack bits") {
|
|
32
|
+
for (uint8_t bits = 1; bits <= 63; ++bits) {
|
|
33
|
+
const uint64_t mask = (1ULL << bits) - 1;
|
|
34
|
+
std::vector<uint64_t> input(8, 0);
|
|
35
|
+
const uint64_t igolden64 = IGOLDEN64;
|
|
36
|
+
uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value
|
|
37
|
+
for (int i = 0; i < 8; ++i) {
|
|
38
|
+
input[i] = value & mask;
|
|
39
|
+
value += igolden64;
|
|
40
|
+
}
|
|
41
|
+
std::vector<uint8_t> bytes(8 * sizeof(uint64_t), 0);
|
|
42
|
+
uint8_t offset = 0;
|
|
43
|
+
uint8_t* ptr = bytes.data();
|
|
44
|
+
for (int i = 0; i < 8; ++i) {
|
|
45
|
+
offset = pack_bits(input[i], bits, ptr, offset);
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
std::vector<uint64_t> output(8, 0);
|
|
49
|
+
offset = 0;
|
|
50
|
+
const uint8_t* cptr = bytes.data();
|
|
51
|
+
for (int i = 0; i < 8; ++i) {
|
|
52
|
+
offset = unpack_bits(output[i], bits, cptr, offset);
|
|
53
|
+
}
|
|
54
|
+
for (int i = 0; i < 8; ++i) {
|
|
55
|
+
REQUIRE((input[i] & mask) == output[i]);
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
TEST_CASE("pack unpack blocks") {
|
|
61
|
+
for (uint8_t bits = 1; bits <= 63; ++bits) {
|
|
62
|
+
const uint64_t mask = (1ULL << bits) - 1;
|
|
63
|
+
std::vector<uint64_t> input(8, 0);
|
|
64
|
+
const uint64_t igolden64 = IGOLDEN64;
|
|
65
|
+
uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value
|
|
66
|
+
for (int i = 0; i < 8; ++i) {
|
|
67
|
+
input[i] = value & mask;
|
|
68
|
+
value += igolden64;
|
|
69
|
+
}
|
|
70
|
+
std::vector<uint8_t> bytes(8 * sizeof(uint64_t), 0);
|
|
71
|
+
pack_bits_block8(input.data(), bytes.data(), bits);
|
|
72
|
+
std::vector<uint64_t> output(8, 0);
|
|
73
|
+
unpack_bits_block8(output.data(), bytes.data(), bits);
|
|
74
|
+
for (int i = 0; i < 8; ++i) {
|
|
75
|
+
REQUIRE((input[i] & mask) == output[i]);
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
} /* namespace datasketches */
|
|
@@ -607,7 +607,7 @@ TEST_CASE("theta sketch: wrap compact estimation from java", "[theta_sketch]") {
|
|
|
607
607
|
compact_theta_sketch compact_sketch = update_sketch.compact();
|
|
608
608
|
// the sketches are ordered, so the iteration sequence must match exactly
|
|
609
609
|
auto iter = sketch.begin();
|
|
610
|
-
for (const auto
|
|
610
|
+
for (const auto key: compact_sketch) {
|
|
611
611
|
REQUIRE(*iter == key);
|
|
612
612
|
++iter;
|
|
613
613
|
}
|
|
@@ -652,7 +652,7 @@ TEST_CASE("theta sketch: wrap compact v1 estimation from java", "[theta_sketch]"
|
|
|
652
652
|
compact_theta_sketch compact_sketch = update_sketch.compact();
|
|
653
653
|
// the sketches are ordered, so the iteration sequence must match exactly
|
|
654
654
|
auto iter = sketch.begin();
|
|
655
|
-
for (const auto
|
|
655
|
+
for (const auto key: compact_sketch) {
|
|
656
656
|
REQUIRE(*iter == key);
|
|
657
657
|
++iter;
|
|
658
658
|
}
|
|
@@ -697,7 +697,46 @@ TEST_CASE("theta sketch: wrap compact v2 estimation from java", "[theta_sketch]"
|
|
|
697
697
|
compact_theta_sketch compact_sketch = update_sketch.compact();
|
|
698
698
|
// the sketches are ordered, so the iteration sequence must match exactly
|
|
699
699
|
auto iter = sketch.begin();
|
|
700
|
-
for (const auto
|
|
700
|
+
for (const auto key: compact_sketch) {
|
|
701
|
+
REQUIRE(*iter == key);
|
|
702
|
+
++iter;
|
|
703
|
+
}
|
|
704
|
+
}
|
|
705
|
+
|
|
706
|
+
TEST_CASE("theta sketch: serialize deserialize compressed", "[theta_sketch]") {
|
|
707
|
+
auto update_sketch = update_theta_sketch::builder().build();
|
|
708
|
+
for (int i = 0; i < 10000; i++) update_sketch.update(i);
|
|
709
|
+
auto compact_sketch = update_sketch.compact();
|
|
710
|
+
|
|
711
|
+
auto bytes = compact_sketch.serialize_compressed();
|
|
712
|
+
{ // deserialize bytes
|
|
713
|
+
auto deserialized_sketch = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
|
|
714
|
+
REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
|
|
715
|
+
REQUIRE(deserialized_sketch.get_theta() == compact_sketch.get_theta());
|
|
716
|
+
auto iter = deserialized_sketch.begin();
|
|
717
|
+
for (const auto key: compact_sketch) {
|
|
718
|
+
REQUIRE(*iter == key);
|
|
719
|
+
++iter;
|
|
720
|
+
}
|
|
721
|
+
}
|
|
722
|
+
{ // wrap bytes
|
|
723
|
+
auto wrapped_sketch = wrapped_compact_theta_sketch::wrap(bytes.data(), bytes.size());
|
|
724
|
+
REQUIRE(wrapped_sketch.get_num_retained() == compact_sketch.get_num_retained());
|
|
725
|
+
REQUIRE(wrapped_sketch.get_theta() == compact_sketch.get_theta());
|
|
726
|
+
auto iter = wrapped_sketch.begin();
|
|
727
|
+
for (const auto key: compact_sketch) {
|
|
728
|
+
REQUIRE(*iter == key);
|
|
729
|
+
++iter;
|
|
730
|
+
}
|
|
731
|
+
}
|
|
732
|
+
|
|
733
|
+
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
|
734
|
+
compact_sketch.serialize_compressed(s);
|
|
735
|
+
auto deserialized_sketch = compact_theta_sketch::deserialize(s);
|
|
736
|
+
REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
|
|
737
|
+
REQUIRE(deserialized_sketch.get_theta() == compact_sketch.get_theta());
|
|
738
|
+
auto iter = deserialized_sketch.begin();
|
|
739
|
+
for (const auto key: compact_sketch) {
|
|
701
740
|
REQUIRE(*iter == key);
|
|
702
741
|
++iter;
|
|
703
742
|
}
|
|
@@ -128,4 +128,29 @@ TEST_CASE("theta union: seed mismatch", "[theta_union]") {
|
|
|
128
128
|
REQUIRE_THROWS_AS(u.update(sketch), std::invalid_argument);
|
|
129
129
|
}
|
|
130
130
|
|
|
131
|
+
TEST_CASE("theta union: larger K", "[theta_union]") {
|
|
132
|
+
auto update_sketch1 = datasketches::update_theta_sketch::builder().set_lg_k(14).build();
|
|
133
|
+
for(int i = 0; i < 16384; ++i) update_sketch1.update(i);
|
|
134
|
+
|
|
135
|
+
auto update_sketch2 = datasketches::update_theta_sketch::builder().set_lg_k(14).build();
|
|
136
|
+
for(int i = 0; i < 26384; ++i) update_sketch2.update(i);
|
|
137
|
+
|
|
138
|
+
auto update_sketch3 = datasketches::update_theta_sketch::builder().set_lg_k(14).build();
|
|
139
|
+
for(int i = 0; i < 86384; ++i) update_sketch3.update(i);
|
|
140
|
+
|
|
141
|
+
auto union1 = datasketches::theta_union::builder().set_lg_k(16).build();
|
|
142
|
+
union1.update(update_sketch2);
|
|
143
|
+
union1.update(update_sketch1);
|
|
144
|
+
union1.update(update_sketch3);
|
|
145
|
+
auto result1 = union1.get_result();
|
|
146
|
+
REQUIRE(result1.get_estimate() == update_sketch3.get_estimate());
|
|
147
|
+
|
|
148
|
+
auto union2 = datasketches::theta_union::builder().set_lg_k(16).build();
|
|
149
|
+
union2.update(update_sketch1);
|
|
150
|
+
union2.update(update_sketch3);
|
|
151
|
+
union2.update(update_sketch2);
|
|
152
|
+
auto result2 = union2.get_result();
|
|
153
|
+
REQUIRE(result2.get_estimate() == update_sketch3.get_estimate());
|
|
154
|
+
}
|
|
155
|
+
|
|
131
156
|
} /* namespace datasketches */
|
|
@@ -32,7 +32,8 @@ bool tuple_sketch<S, A>::is_estimation_mode() const {
|
|
|
32
32
|
|
|
33
33
|
template<typename S, typename A>
|
|
34
34
|
double tuple_sketch<S, A>::get_theta() const {
|
|
35
|
-
return static_cast<double>(get_theta64()) /
|
|
35
|
+
return static_cast<double>(get_theta64()) /
|
|
36
|
+
static_cast<double>(theta_constants::MAX_THETA);
|
|
36
37
|
}
|
|
37
38
|
|
|
38
39
|
template<typename S, typename A>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
4.0
|
|
1
|
+
4.1.0
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: datasketches
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.3.1
|
|
4
|
+
version: 0.3.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andrew Kane
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2023-
|
|
11
|
+
date: 2023-05-03 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rice
|
|
@@ -76,6 +76,12 @@ files:
|
|
|
76
76
|
- vendor/datasketches-cpp/common/test/test_allocator.cpp
|
|
77
77
|
- vendor/datasketches-cpp/common/test/test_allocator.hpp
|
|
78
78
|
- vendor/datasketches-cpp/common/test/test_type.hpp
|
|
79
|
+
- vendor/datasketches-cpp/count/CMakeLists.txt
|
|
80
|
+
- vendor/datasketches-cpp/count/include/count_min.hpp
|
|
81
|
+
- vendor/datasketches-cpp/count/include/count_min_impl.hpp
|
|
82
|
+
- vendor/datasketches-cpp/count/test/CMakeLists.txt
|
|
83
|
+
- vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp
|
|
84
|
+
- vendor/datasketches-cpp/count/test/count_min_test.cpp
|
|
79
85
|
- vendor/datasketches-cpp/cpc/CMakeLists.txt
|
|
80
86
|
- vendor/datasketches-cpp/cpc/include/compression_data.hpp
|
|
81
87
|
- vendor/datasketches-cpp/cpc/include/cpc_common.hpp
|
|
@@ -96,6 +102,11 @@ files:
|
|
|
96
102
|
- vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp
|
|
97
103
|
- vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp
|
|
98
104
|
- vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp
|
|
105
|
+
- vendor/datasketches-cpp/density/CMakeLists.txt
|
|
106
|
+
- vendor/datasketches-cpp/density/include/density_sketch.hpp
|
|
107
|
+
- vendor/datasketches-cpp/density/include/density_sketch_impl.hpp
|
|
108
|
+
- vendor/datasketches-cpp/density/test/CMakeLists.txt
|
|
109
|
+
- vendor/datasketches-cpp/density/test/density_sketch_test.cpp
|
|
99
110
|
- vendor/datasketches-cpp/fi/CMakeLists.txt
|
|
100
111
|
- vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp
|
|
101
112
|
- vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp
|
|
@@ -173,9 +184,18 @@ files:
|
|
|
173
184
|
- vendor/datasketches-cpp/pyproject.toml
|
|
174
185
|
- vendor/datasketches-cpp/python/CMakeLists.txt
|
|
175
186
|
- vendor/datasketches-cpp/python/README.md
|
|
187
|
+
- vendor/datasketches-cpp/python/datasketches/DensityWrapper.py
|
|
188
|
+
- vendor/datasketches-cpp/python/datasketches/KernelFunction.py
|
|
176
189
|
- vendor/datasketches-cpp/python/datasketches/PySerDe.py
|
|
190
|
+
- vendor/datasketches-cpp/python/datasketches/TuplePolicy.py
|
|
191
|
+
- vendor/datasketches-cpp/python/datasketches/TupleWrapper.py
|
|
177
192
|
- vendor/datasketches-cpp/python/datasketches/__init__.py
|
|
193
|
+
- vendor/datasketches-cpp/python/include/kernel_function.hpp
|
|
194
|
+
- vendor/datasketches-cpp/python/include/py_object_lt.hpp
|
|
195
|
+
- vendor/datasketches-cpp/python/include/py_object_ostream.hpp
|
|
178
196
|
- vendor/datasketches-cpp/python/include/py_serde.hpp
|
|
197
|
+
- vendor/datasketches-cpp/python/include/quantile_conditional.hpp
|
|
198
|
+
- vendor/datasketches-cpp/python/include/tuple_policy.hpp
|
|
179
199
|
- vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb
|
|
180
200
|
- vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb
|
|
181
201
|
- vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb
|
|
@@ -183,8 +203,10 @@ files:
|
|
|
183
203
|
- vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb
|
|
184
204
|
- vendor/datasketches-cpp/python/pybind11Path.cmd
|
|
185
205
|
- vendor/datasketches-cpp/python/src/__init__.py
|
|
206
|
+
- vendor/datasketches-cpp/python/src/count_wrapper.cpp
|
|
186
207
|
- vendor/datasketches-cpp/python/src/cpc_wrapper.cpp
|
|
187
208
|
- vendor/datasketches-cpp/python/src/datasketches.cpp
|
|
209
|
+
- vendor/datasketches-cpp/python/src/density_wrapper.cpp
|
|
188
210
|
- vendor/datasketches-cpp/python/src/fi_wrapper.cpp
|
|
189
211
|
- vendor/datasketches-cpp/python/src/hll_wrapper.cpp
|
|
190
212
|
- vendor/datasketches-cpp/python/src/kll_wrapper.cpp
|
|
@@ -193,16 +215,20 @@ files:
|
|
|
193
215
|
- vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp
|
|
194
216
|
- vendor/datasketches-cpp/python/src/req_wrapper.cpp
|
|
195
217
|
- vendor/datasketches-cpp/python/src/theta_wrapper.cpp
|
|
218
|
+
- vendor/datasketches-cpp/python/src/tuple_wrapper.cpp
|
|
196
219
|
- vendor/datasketches-cpp/python/src/vector_of_kll.cpp
|
|
197
220
|
- vendor/datasketches-cpp/python/src/vo_wrapper.cpp
|
|
198
221
|
- vendor/datasketches-cpp/python/tests/__init__.py
|
|
222
|
+
- vendor/datasketches-cpp/python/tests/count_min_test.py
|
|
199
223
|
- vendor/datasketches-cpp/python/tests/cpc_test.py
|
|
224
|
+
- vendor/datasketches-cpp/python/tests/density_test.py
|
|
200
225
|
- vendor/datasketches-cpp/python/tests/fi_test.py
|
|
201
226
|
- vendor/datasketches-cpp/python/tests/hll_test.py
|
|
202
227
|
- vendor/datasketches-cpp/python/tests/kll_test.py
|
|
203
228
|
- vendor/datasketches-cpp/python/tests/quantiles_test.py
|
|
204
229
|
- vendor/datasketches-cpp/python/tests/req_test.py
|
|
205
230
|
- vendor/datasketches-cpp/python/tests/theta_test.py
|
|
231
|
+
- vendor/datasketches-cpp/python/tests/tuple_test.py
|
|
206
232
|
- vendor/datasketches-cpp/python/tests/vector_of_kll_test.py
|
|
207
233
|
- vendor/datasketches-cpp/python/tests/vo_test.py
|
|
208
234
|
- vendor/datasketches-cpp/quantiles/CMakeLists.txt
|
|
@@ -249,6 +275,7 @@ files:
|
|
|
249
275
|
- vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk
|
|
250
276
|
- vendor/datasketches-cpp/setup.py
|
|
251
277
|
- vendor/datasketches-cpp/theta/CMakeLists.txt
|
|
278
|
+
- vendor/datasketches-cpp/theta/include/bit_packing.hpp
|
|
252
279
|
- vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp
|
|
253
280
|
- vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp
|
|
254
281
|
- vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp
|
|
@@ -275,6 +302,7 @@ files:
|
|
|
275
302
|
- vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp
|
|
276
303
|
- vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp
|
|
277
304
|
- vendor/datasketches-cpp/theta/test/CMakeLists.txt
|
|
305
|
+
- vendor/datasketches-cpp/theta/test/bit_packing_test.cpp
|
|
278
306
|
- vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp
|
|
279
307
|
- vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk
|
|
280
308
|
- vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk
|
|
@@ -342,7 +370,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
342
370
|
- !ruby/object:Gem::Version
|
|
343
371
|
version: '0'
|
|
344
372
|
requirements: []
|
|
345
|
-
rubygems_version: 3.4.
|
|
373
|
+
rubygems_version: 3.4.10
|
|
346
374
|
signing_key:
|
|
347
375
|
specification_version: 4
|
|
348
376
|
summary: Sketch data structures for Ruby
|