datasketches 0.2.0 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE +40 -3
- data/NOTICE +1 -1
- data/README.md +7 -7
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
- data/vendor/datasketches-cpp/LICENSE +40 -3
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/pyproject.toml +4 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
- data/vendor/datasketches-cpp/python/README.md +50 -50
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +10 -7
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
- metadata +18 -7
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -31,9 +31,9 @@ static inline uint64_t divide_longs_rounding_up(uint64_t x, uint64_t y) {
|
|
31
31
|
else return quotient + 1;
|
32
32
|
}
|
33
33
|
|
34
|
-
static inline
|
35
|
-
if (x < 1) throw std::invalid_argument("
|
36
|
-
|
34
|
+
static inline uint8_t floor_log2_of_long(uint64_t x) {
|
35
|
+
if (x < 1) throw std::invalid_argument("floor_log2_of_long: bad argument");
|
36
|
+
uint8_t p = 0;
|
37
37
|
uint64_t y = 1;
|
38
38
|
while (true) {
|
39
39
|
if (y == x) return p;
|
@@ -69,7 +69,7 @@ static inline uint64_t wegner_count_bits_set_in_matrix(const uint64_t* array, si
|
|
69
69
|
// Note: this is an adaptation of the Java code,
|
70
70
|
// which is apparently a variation of Figure 5-2 in "Hacker's Delight"
|
71
71
|
// by Henry S. Warren.
|
72
|
-
static inline uint64_t warren_bit_count(uint64_t i) {
|
72
|
+
static inline uint32_t warren_bit_count(uint64_t i) {
|
73
73
|
i = i - ((i >> 1) & 0x5555555555555555ULL);
|
74
74
|
i = (i & 0x3333333333333333ULL) + ((i >> 2) & 0x3333333333333333ULL);
|
75
75
|
i = (i + (i >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
|
@@ -79,9 +79,9 @@ static inline uint64_t warren_bit_count(uint64_t i) {
|
|
79
79
|
return i & 0x7f;
|
80
80
|
}
|
81
81
|
|
82
|
-
static inline uint64_t warren_count_bits_set_in_matrix(const uint64_t* array, size_t length) {
|
83
|
-
|
84
|
-
for (
|
82
|
+
static inline uint32_t warren_count_bits_set_in_matrix(const uint64_t* array, uint32_t length) {
|
83
|
+
uint32_t count = 0;
|
84
|
+
for (uint32_t i = 0; i < length; i++) {
|
85
85
|
count += warren_bit_count(array[i]);
|
86
86
|
}
|
87
87
|
return count;
|
@@ -91,13 +91,13 @@ static inline uint64_t warren_count_bits_set_in_matrix(const uint64_t* array, si
|
|
91
91
|
|
92
92
|
#define CSA(h,l,a,b,c) {uint64_t u = a ^ b; uint64_t v = c; h = (a & b) | (u & v); l = u ^ v;}
|
93
93
|
|
94
|
-
static inline
|
94
|
+
static inline uint32_t count_bits_set_in_matrix(const uint64_t* a, uint32_t length) {
|
95
95
|
if ((length & 0x7) != 0) throw std::invalid_argument("the length of the array must be a multiple of 8");
|
96
|
-
|
96
|
+
uint32_t total = 0;
|
97
97
|
uint64_t ones, twos, twos_a, twos_b, fours, fours_a, fours_b, eights;
|
98
98
|
fours = twos = ones = 0;
|
99
99
|
|
100
|
-
for (
|
100
|
+
for (uint32_t i = 0; i <= length - 8; i += 8) {
|
101
101
|
CSA(twos_a, ones, ones, a[i+0], a[i+1]);
|
102
102
|
CSA(twos_b, ones, ones, a[i+2], a[i+3]);
|
103
103
|
CSA(fours_a, twos, twos, twos_a, twos_b);
|
@@ -245,12 +245,12 @@ static inline double icon_exponential_approximation(double k, double c) {
|
|
245
245
|
return (0.7940236163830469 * k * pow(2.0, c / k));
|
246
246
|
}
|
247
247
|
|
248
|
-
static inline double compute_icon_estimate(uint8_t lg_k,
|
248
|
+
static inline double compute_icon_estimate(uint8_t lg_k, uint32_t c) {
|
249
249
|
if (lg_k < ICON_MIN_LOG_K || lg_k > ICON_MAX_LOG_K) throw std::out_of_range("lg_k out of range");
|
250
250
|
if (c < 2) return ((c == 0) ? 0.0 : 1.0);
|
251
|
-
const
|
252
|
-
const double double_k = k;
|
253
|
-
const double double_c = c;
|
251
|
+
const uint32_t k = 1 << lg_k;
|
252
|
+
const double double_k = static_cast<double>(k);
|
253
|
+
const double double_c = static_cast<double>(c);
|
254
254
|
// Differing thresholds ensure that the approximated estimator is monotonically increasing.
|
255
255
|
const double threshold_factor = ((lg_k < 14) ? 5.7 : 5.6);
|
256
256
|
if (double_c > (threshold_factor * double_k)) return icon_exponential_approximation(double_k, double_c);
|
@@ -29,11 +29,11 @@
|
|
29
29
|
|
30
30
|
namespace datasketches {
|
31
31
|
|
32
|
-
static const
|
33
|
-
static const
|
32
|
+
static const uint32_t U32_TABLE_UPSIZE_NUMER = 3LL;
|
33
|
+
static const uint32_t U32_TABLE_UPSIZE_DENOM = 4LL;
|
34
34
|
|
35
|
-
static const
|
36
|
-
static const
|
35
|
+
static const uint32_t U32_TABLE_DOWNSIZE_NUMER = 1LL;
|
36
|
+
static const uint32_t U32_TABLE_DOWNSIZE_DENOM = 4LL;
|
37
37
|
|
38
38
|
template<typename A>
|
39
39
|
class u32_table {
|
@@ -42,7 +42,7 @@ public:
|
|
42
42
|
u32_table(const A& allocator);
|
43
43
|
u32_table(uint8_t lg_size, uint8_t num_valid_bits, const A& allocator);
|
44
44
|
|
45
|
-
inline
|
45
|
+
inline uint32_t get_num_items() const;
|
46
46
|
inline const uint32_t* get_slots() const;
|
47
47
|
inline uint8_t get_lg_size() const;
|
48
48
|
inline void clear();
|
@@ -52,7 +52,7 @@ public:
|
|
52
52
|
// returns true iff the item was present and was therefore removed from the table
|
53
53
|
inline bool maybe_delete(uint32_t item);
|
54
54
|
|
55
|
-
static u32_table make_from_pairs(const uint32_t* pairs, size_t num_pairs, uint8_t lg_k, const A& allocator);
|
55
|
+
static u32_table make_from_pairs(const uint32_t* pairs, uint32_t num_pairs, uint8_t lg_k, const A& allocator);
|
56
56
|
|
57
57
|
vector_u32<A> unwrapping_get_items() const;
|
58
58
|
|
@@ -69,10 +69,10 @@ private:
|
|
69
69
|
|
70
70
|
uint8_t lg_size; // log2 of number of slots
|
71
71
|
uint8_t num_valid_bits;
|
72
|
-
|
72
|
+
uint32_t num_items;
|
73
73
|
vector_u32<A> slots;
|
74
74
|
|
75
|
-
inline
|
75
|
+
inline uint32_t lookup(uint32_t item) const;
|
76
76
|
inline void must_insert(uint32_t item);
|
77
77
|
inline void rebuild(uint8_t new_lg_size);
|
78
78
|
};
|
@@ -41,14 +41,14 @@ u32_table<A>::u32_table(uint8_t lg_size, uint8_t num_valid_bits, const A& alloca
|
|
41
41
|
lg_size(lg_size),
|
42
42
|
num_valid_bits(num_valid_bits),
|
43
43
|
num_items(0),
|
44
|
-
slots(
|
44
|
+
slots(1ULL << lg_size, UINT32_MAX, allocator)
|
45
45
|
{
|
46
46
|
if (lg_size < 2) throw std::invalid_argument("lg_size must be >= 2");
|
47
47
|
if (num_valid_bits < 1 || num_valid_bits > 32) throw std::invalid_argument("num_valid_bits must be between 1 and 32");
|
48
48
|
}
|
49
49
|
|
50
50
|
template<typename A>
|
51
|
-
|
51
|
+
uint32_t u32_table<A>::get_num_items() const {
|
52
52
|
return num_items;
|
53
53
|
}
|
54
54
|
|
@@ -70,7 +70,7 @@ void u32_table<A>::clear() {
|
|
70
70
|
|
71
71
|
template<typename A>
|
72
72
|
bool u32_table<A>::maybe_insert(uint32_t item) {
|
73
|
-
const
|
73
|
+
const uint32_t index = lookup(item);
|
74
74
|
if (slots[index] == item) return false;
|
75
75
|
if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert");
|
76
76
|
slots[index] = item;
|
@@ -83,7 +83,7 @@ bool u32_table<A>::maybe_insert(uint32_t item) {
|
|
83
83
|
|
84
84
|
template<typename A>
|
85
85
|
bool u32_table<A>::maybe_delete(uint32_t item) {
|
86
|
-
const
|
86
|
+
const uint32_t index = lookup(item);
|
87
87
|
if (slots[index] == UINT32_MAX) return false;
|
88
88
|
if (slots[index] != item) throw std::logic_error("item does not exist");
|
89
89
|
if (num_items == 0) throw std::logic_error("delete error");
|
@@ -110,7 +110,7 @@ bool u32_table<A>::maybe_delete(uint32_t item) {
|
|
110
110
|
|
111
111
|
// this one is specifically tailored to be a part of fm85 decompression scheme
|
112
112
|
template<typename A>
|
113
|
-
u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs, size_t num_pairs, uint8_t lg_k, const A& allocator) {
|
113
|
+
u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs, uint32_t num_pairs, uint8_t lg_k, const A& allocator) {
|
114
114
|
uint8_t lg_num_slots = 2;
|
115
115
|
while (U32_TABLE_UPSIZE_DENOM * num_pairs > U32_TABLE_UPSIZE_NUMER * (1 << lg_num_slots)) lg_num_slots++;
|
116
116
|
u32_table<A> table(lg_num_slots, 6 + lg_k, allocator);
|
@@ -124,11 +124,11 @@ u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs, size_t num_pai
|
|
124
124
|
}
|
125
125
|
|
126
126
|
template<typename A>
|
127
|
-
|
128
|
-
const
|
129
|
-
const
|
127
|
+
uint32_t u32_table<A>::lookup(uint32_t item) const {
|
128
|
+
const uint32_t size = 1 << lg_size;
|
129
|
+
const uint32_t mask = size - 1;
|
130
130
|
const uint8_t shift = num_valid_bits - lg_size;
|
131
|
-
|
131
|
+
uint32_t probe = item >> shift;
|
132
132
|
if (probe > mask) throw std::logic_error("probe out of range");
|
133
133
|
while (slots[probe] != item && slots[probe] != UINT32_MAX) {
|
134
134
|
probe = (probe + 1) & mask;
|
@@ -139,7 +139,7 @@ size_t u32_table<A>::lookup(uint32_t item) const {
|
|
139
139
|
// counts and resizing must be handled by the caller
|
140
140
|
template<typename A>
|
141
141
|
void u32_table<A>::must_insert(uint32_t item) {
|
142
|
-
const
|
142
|
+
const uint32_t index = lookup(item);
|
143
143
|
if (slots[index] == item) throw std::logic_error("item exists");
|
144
144
|
if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert");
|
145
145
|
slots[index] = item;
|
@@ -148,13 +148,13 @@ void u32_table<A>::must_insert(uint32_t item) {
|
|
148
148
|
template<typename A>
|
149
149
|
void u32_table<A>::rebuild(uint8_t new_lg_size) {
|
150
150
|
if (new_lg_size < 2) throw std::logic_error("lg_size must be >= 2");
|
151
|
-
const
|
152
|
-
const
|
151
|
+
const uint32_t old_size = 1 << lg_size;
|
152
|
+
const uint32_t new_size = 1 << new_lg_size;
|
153
153
|
if (new_size <= num_items) throw std::logic_error("new_size <= num_items");
|
154
154
|
vector_u32<A> old_slots = std::move(slots);
|
155
155
|
slots = vector_u32<A>(new_size, UINT32_MAX, old_slots.get_allocator());
|
156
156
|
lg_size = new_lg_size;
|
157
|
-
for (
|
157
|
+
for (uint32_t i = 0; i < old_size; i++) {
|
158
158
|
if (old_slots[i] != UINT32_MAX) {
|
159
159
|
must_insert(old_slots[i]);
|
160
160
|
}
|
@@ -170,7 +170,7 @@ void u32_table<A>::rebuild(uint8_t new_lg_size) {
|
|
170
170
|
template<typename A>
|
171
171
|
vector_u32<A> u32_table<A>::unwrapping_get_items() const {
|
172
172
|
if (num_items == 0) return vector_u32<A>(slots.get_allocator());
|
173
|
-
const
|
173
|
+
const uint32_t table_size = 1 << lg_size;
|
174
174
|
vector_u32<A> result(num_items, 0, slots.get_allocator());
|
175
175
|
size_t i = 0;
|
176
176
|
size_t l = 0;
|
@@ -27,38 +27,38 @@ namespace datasketches {
|
|
27
27
|
typedef u32_table<std::allocator<void>> table;
|
28
28
|
|
29
29
|
TEST_CASE("cpc sketch: compress and decompress pairs", "[cpc_sketch]") {
|
30
|
-
const
|
31
|
-
const
|
30
|
+
const size_t N = 200;
|
31
|
+
const size_t MAXWORDS = 1000;
|
32
32
|
|
33
33
|
HashState twoHashes;
|
34
34
|
uint32_t pairArray[N];
|
35
35
|
uint32_t pairArray2[N];
|
36
36
|
uint64_t value = 35538947; // some arbitrary starting value
|
37
37
|
const uint64_t golden64 = 0x9e3779b97f4a7c13ULL; // the golden ratio
|
38
|
-
for (
|
38
|
+
for (size_t i = 0; i < N; i++) {
|
39
39
|
MurmurHash3_x64_128(&value, sizeof(value), 0, twoHashes);
|
40
40
|
uint32_t rand = twoHashes.h1 & 0xffff;
|
41
41
|
pairArray[i] = rand;
|
42
42
|
value += golden64;
|
43
43
|
}
|
44
44
|
//table::knuth_shell_sort3(pairArray, 0, N - 1); // unsigned numerical sort
|
45
|
-
std::sort(pairArray,
|
45
|
+
std::sort(pairArray, pairArray + N);
|
46
46
|
uint32_t prev = UINT32_MAX;
|
47
|
-
|
48
|
-
for (
|
47
|
+
uint32_t nxt = 0;
|
48
|
+
for (size_t i = 0; i < N; i++) { // uniquify
|
49
49
|
if (pairArray[i] != prev) {
|
50
50
|
prev = pairArray[i];
|
51
51
|
pairArray[nxt++] = pairArray[i];
|
52
52
|
}
|
53
53
|
}
|
54
|
-
|
54
|
+
uint32_t numPairs = nxt;
|
55
55
|
|
56
56
|
uint32_t compressedWords[MAXWORDS];
|
57
57
|
|
58
|
-
for (
|
59
|
-
|
58
|
+
for (uint8_t numBaseBits = 0; numBaseBits <= 11; numBaseBits++) {
|
59
|
+
uint32_t numWordsWritten = get_compressor<std::allocator<void>>().low_level_compress_pairs(pairArray, numPairs, numBaseBits, compressedWords);
|
60
60
|
get_compressor<std::allocator<void>>().low_level_uncompress_pairs(pairArray2, numPairs, numBaseBits, compressedWords, numWordsWritten);
|
61
|
-
for (
|
61
|
+
for (size_t i = 0; i < numPairs; i++) {
|
62
62
|
REQUIRE(pairArray[i] == pairArray2[i]);
|
63
63
|
}
|
64
64
|
}
|
@@ -25,6 +25,7 @@
|
|
25
25
|
#include <catch.hpp>
|
26
26
|
|
27
27
|
#include "cpc_sketch.hpp"
|
28
|
+
#include "cpc_union.hpp"
|
28
29
|
#include "test_allocator.hpp"
|
29
30
|
|
30
31
|
namespace datasketches {
|
@@ -234,4 +235,20 @@ TEST_CASE("cpc sketch allocation: serialize deserialize sliding, bytes", "[cpc_s
|
|
234
235
|
REQUIRE(test_allocator_net_allocations == 0);
|
235
236
|
}
|
236
237
|
|
238
|
+
using cpc_union_test_alloc = cpc_union_alloc<test_allocator<uint8_t>>;
|
239
|
+
|
240
|
+
TEST_CASE("cpc sketch allocation: union") {
|
241
|
+
cpc_sketch_test_alloc s1(11, DEFAULT_SEED, 0);
|
242
|
+
s1.update(1);
|
243
|
+
|
244
|
+
cpc_sketch_test_alloc s2(11, DEFAULT_SEED, 0);
|
245
|
+
s2.update(2);
|
246
|
+
|
247
|
+
cpc_union_test_alloc u(11, DEFAULT_SEED, 0);
|
248
|
+
u.update(s1);
|
249
|
+
u.update(s2);
|
250
|
+
auto s3 = u.get_result();
|
251
|
+
REQUIRE_FALSE(s3.is_empty());
|
252
|
+
}
|
253
|
+
|
237
254
|
} /* namespace datasketches */
|
@@ -283,6 +283,26 @@ TEST_CASE("cpc sketch: serialize deserialize sliding, bytes", "[cpc_sketch]") {
|
|
283
283
|
REQUIRE(deserialized.validate());
|
284
284
|
}
|
285
285
|
|
286
|
+
TEST_CASE("cpc sketch: serialize deserialize sliding huge", "[cpc_sketch]") {
|
287
|
+
cpc_sketch sketch(26);
|
288
|
+
const int n = 10000000;
|
289
|
+
for (int i = 0; i < n; i++) sketch.update(i);
|
290
|
+
REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.001));
|
291
|
+
auto bytes = sketch.serialize();
|
292
|
+
cpc_sketch deserialized = cpc_sketch::deserialize(bytes.data(), bytes.size());
|
293
|
+
REQUIRE(deserialized.is_empty() == sketch.is_empty());
|
294
|
+
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
|
295
|
+
REQUIRE(deserialized.validate());
|
296
|
+
REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), 7), std::out_of_range);
|
297
|
+
REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), 15), std::out_of_range);
|
298
|
+
REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
|
299
|
+
|
300
|
+
// updating again with the same values should not change the sketch
|
301
|
+
for (int i = 0; i < n; i++) deserialized.update(i);
|
302
|
+
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
|
303
|
+
REQUIRE(deserialized.validate());
|
304
|
+
}
|
305
|
+
|
286
306
|
TEST_CASE("cpc sketch: copy", "[cpc_sketch]") {
|
287
307
|
cpc_sketch s1(11);
|
288
308
|
s1.update(1);
|
@@ -378,4 +398,9 @@ TEST_CASE("cpc sketch: update string equivalence", "[cpc_sketch]") {
|
|
378
398
|
REQUIRE(sketch.get_estimate() == Approx(1).margin(RELATIVE_ERROR_FOR_LG_K_11));
|
379
399
|
}
|
380
400
|
|
401
|
+
TEST_CASE("cpc sketch: max serialized size", "[cpc_sketch]") {
|
402
|
+
REQUIRE(cpc_sketch::get_max_serialized_size_bytes(4) == 24 + 40);
|
403
|
+
REQUIRE(cpc_sketch::get_max_serialized_size_bytes(26) == static_cast<size_t>((0.6 * (1 << 26)) + 40));
|
404
|
+
}
|
405
|
+
|
381
406
|
} /* namespace datasketches */
|
@@ -32,23 +32,13 @@ target_include_directories(fi
|
|
32
32
|
target_link_libraries(fi INTERFACE common)
|
33
33
|
target_compile_features(fi INTERFACE cxx_std_11)
|
34
34
|
|
35
|
-
set(fi_HEADERS "")
|
36
|
-
list(APPEND fi_HEADERS "include/frequent_items_sketch.hpp")
|
37
|
-
list(APPEND fi_HEADERS "include/frequent_items_sketch_impl.hpp")
|
38
|
-
list(APPEND fi_HEADERS "include/reverse_purge_hash_map.hpp")
|
39
|
-
list(APPEND fi_HEADERS "include/reverse_purge_hash_map_impl.hpp")
|
40
|
-
|
41
35
|
install(TARGETS fi
|
42
36
|
EXPORT ${PROJECT_NAME}
|
43
37
|
)
|
44
38
|
|
45
|
-
install(FILES
|
39
|
+
install(FILES
|
40
|
+
include/frequent_items_sketch.hpp
|
41
|
+
include/frequent_items_sketch_impl.hpp
|
42
|
+
include/reverse_purge_hash_map.hpp
|
43
|
+
include/reverse_purge_hash_map_impl.hpp
|
46
44
|
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
|
47
|
-
|
48
|
-
target_sources(fi
|
49
|
-
INTERFACE
|
50
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/frequent_items_sketch.hpp
|
51
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/frequent_items_sketch_impl.hpp
|
52
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/reverse_purge_hash_map.hpp
|
53
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/reverse_purge_hash_map_impl.hpp
|
54
|
-
)
|