datasketches 0.2.0 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE +40 -3
- data/NOTICE +1 -1
- data/README.md +7 -7
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
- data/vendor/datasketches-cpp/LICENSE +40 -3
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/pyproject.toml +4 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
- data/vendor/datasketches-cpp/python/README.md +50 -50
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +10 -7
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
- metadata +18 -7
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
|
@@ -31,9 +31,9 @@ static inline uint64_t divide_longs_rounding_up(uint64_t x, uint64_t y) {
|
|
|
31
31
|
else return quotient + 1;
|
|
32
32
|
}
|
|
33
33
|
|
|
34
|
-
static inline
|
|
35
|
-
if (x < 1) throw std::invalid_argument("
|
|
36
|
-
|
|
34
|
+
static inline uint8_t floor_log2_of_long(uint64_t x) {
|
|
35
|
+
if (x < 1) throw std::invalid_argument("floor_log2_of_long: bad argument");
|
|
36
|
+
uint8_t p = 0;
|
|
37
37
|
uint64_t y = 1;
|
|
38
38
|
while (true) {
|
|
39
39
|
if (y == x) return p;
|
|
@@ -69,7 +69,7 @@ static inline uint64_t wegner_count_bits_set_in_matrix(const uint64_t* array, si
|
|
|
69
69
|
// Note: this is an adaptation of the Java code,
|
|
70
70
|
// which is apparently a variation of Figure 5-2 in "Hacker's Delight"
|
|
71
71
|
// by Henry S. Warren.
|
|
72
|
-
static inline
|
|
72
|
+
static inline uint32_t warren_bit_count(uint64_t i) {
|
|
73
73
|
i = i - ((i >> 1) & 0x5555555555555555ULL);
|
|
74
74
|
i = (i & 0x3333333333333333ULL) + ((i >> 2) & 0x3333333333333333ULL);
|
|
75
75
|
i = (i + (i >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
|
|
@@ -79,9 +79,9 @@ static inline uint64_t warren_bit_count(uint64_t i) {
|
|
|
79
79
|
return i & 0x7f;
|
|
80
80
|
}
|
|
81
81
|
|
|
82
|
-
static inline
|
|
83
|
-
|
|
84
|
-
for (
|
|
82
|
+
static inline uint32_t warren_count_bits_set_in_matrix(const uint64_t* array, uint32_t length) {
|
|
83
|
+
uint32_t count = 0;
|
|
84
|
+
for (uint32_t i = 0; i < length; i++) {
|
|
85
85
|
count += warren_bit_count(array[i]);
|
|
86
86
|
}
|
|
87
87
|
return count;
|
|
@@ -91,13 +91,13 @@ static inline uint64_t warren_count_bits_set_in_matrix(const uint64_t* array, si
|
|
|
91
91
|
|
|
92
92
|
#define CSA(h,l,a,b,c) {uint64_t u = a ^ b; uint64_t v = c; h = (a & b) | (u & v); l = u ^ v;}
|
|
93
93
|
|
|
94
|
-
static inline
|
|
94
|
+
static inline uint32_t count_bits_set_in_matrix(const uint64_t* a, uint32_t length) {
|
|
95
95
|
if ((length & 0x7) != 0) throw std::invalid_argument("the length of the array must be a multiple of 8");
|
|
96
|
-
|
|
96
|
+
uint32_t total = 0;
|
|
97
97
|
uint64_t ones, twos, twos_a, twos_b, fours, fours_a, fours_b, eights;
|
|
98
98
|
fours = twos = ones = 0;
|
|
99
99
|
|
|
100
|
-
for (
|
|
100
|
+
for (uint32_t i = 0; i <= length - 8; i += 8) {
|
|
101
101
|
CSA(twos_a, ones, ones, a[i+0], a[i+1]);
|
|
102
102
|
CSA(twos_b, ones, ones, a[i+2], a[i+3]);
|
|
103
103
|
CSA(fours_a, twos, twos, twos_a, twos_b);
|
|
@@ -245,12 +245,12 @@ static inline double icon_exponential_approximation(double k, double c) {
|
|
|
245
245
|
return (0.7940236163830469 * k * pow(2.0, c / k));
|
|
246
246
|
}
|
|
247
247
|
|
|
248
|
-
static inline double compute_icon_estimate(uint8_t lg_k,
|
|
248
|
+
static inline double compute_icon_estimate(uint8_t lg_k, uint32_t c) {
|
|
249
249
|
if (lg_k < ICON_MIN_LOG_K || lg_k > ICON_MAX_LOG_K) throw std::out_of_range("lg_k out of range");
|
|
250
250
|
if (c < 2) return ((c == 0) ? 0.0 : 1.0);
|
|
251
|
-
const
|
|
252
|
-
const double double_k = k;
|
|
253
|
-
const double double_c = c;
|
|
251
|
+
const uint32_t k = 1 << lg_k;
|
|
252
|
+
const double double_k = static_cast<double>(k);
|
|
253
|
+
const double double_c = static_cast<double>(c);
|
|
254
254
|
// Differing thresholds ensure that the approximated estimator is monotonically increasing.
|
|
255
255
|
const double threshold_factor = ((lg_k < 14) ? 5.7 : 5.6);
|
|
256
256
|
if (double_c > (threshold_factor * double_k)) return icon_exponential_approximation(double_k, double_c);
|
|
@@ -29,11 +29,11 @@
|
|
|
29
29
|
|
|
30
30
|
namespace datasketches {
|
|
31
31
|
|
|
32
|
-
static const
|
|
33
|
-
static const
|
|
32
|
+
static const uint32_t U32_TABLE_UPSIZE_NUMER = 3LL;
|
|
33
|
+
static const uint32_t U32_TABLE_UPSIZE_DENOM = 4LL;
|
|
34
34
|
|
|
35
|
-
static const
|
|
36
|
-
static const
|
|
35
|
+
static const uint32_t U32_TABLE_DOWNSIZE_NUMER = 1LL;
|
|
36
|
+
static const uint32_t U32_TABLE_DOWNSIZE_DENOM = 4LL;
|
|
37
37
|
|
|
38
38
|
template<typename A>
|
|
39
39
|
class u32_table {
|
|
@@ -42,7 +42,7 @@ public:
|
|
|
42
42
|
u32_table(const A& allocator);
|
|
43
43
|
u32_table(uint8_t lg_size, uint8_t num_valid_bits, const A& allocator);
|
|
44
44
|
|
|
45
|
-
inline
|
|
45
|
+
inline uint32_t get_num_items() const;
|
|
46
46
|
inline const uint32_t* get_slots() const;
|
|
47
47
|
inline uint8_t get_lg_size() const;
|
|
48
48
|
inline void clear();
|
|
@@ -52,7 +52,7 @@ public:
|
|
|
52
52
|
// returns true iff the item was present and was therefore removed from the table
|
|
53
53
|
inline bool maybe_delete(uint32_t item);
|
|
54
54
|
|
|
55
|
-
static u32_table make_from_pairs(const uint32_t* pairs,
|
|
55
|
+
static u32_table make_from_pairs(const uint32_t* pairs, uint32_t num_pairs, uint8_t lg_k, const A& allocator);
|
|
56
56
|
|
|
57
57
|
vector_u32<A> unwrapping_get_items() const;
|
|
58
58
|
|
|
@@ -69,10 +69,10 @@ private:
|
|
|
69
69
|
|
|
70
70
|
uint8_t lg_size; // log2 of number of slots
|
|
71
71
|
uint8_t num_valid_bits;
|
|
72
|
-
|
|
72
|
+
uint32_t num_items;
|
|
73
73
|
vector_u32<A> slots;
|
|
74
74
|
|
|
75
|
-
inline
|
|
75
|
+
inline uint32_t lookup(uint32_t item) const;
|
|
76
76
|
inline void must_insert(uint32_t item);
|
|
77
77
|
inline void rebuild(uint8_t new_lg_size);
|
|
78
78
|
};
|
|
@@ -41,14 +41,14 @@ u32_table<A>::u32_table(uint8_t lg_size, uint8_t num_valid_bits, const A& alloca
|
|
|
41
41
|
lg_size(lg_size),
|
|
42
42
|
num_valid_bits(num_valid_bits),
|
|
43
43
|
num_items(0),
|
|
44
|
-
slots(
|
|
44
|
+
slots(1ULL << lg_size, UINT32_MAX, allocator)
|
|
45
45
|
{
|
|
46
46
|
if (lg_size < 2) throw std::invalid_argument("lg_size must be >= 2");
|
|
47
47
|
if (num_valid_bits < 1 || num_valid_bits > 32) throw std::invalid_argument("num_valid_bits must be between 1 and 32");
|
|
48
48
|
}
|
|
49
49
|
|
|
50
50
|
template<typename A>
|
|
51
|
-
|
|
51
|
+
uint32_t u32_table<A>::get_num_items() const {
|
|
52
52
|
return num_items;
|
|
53
53
|
}
|
|
54
54
|
|
|
@@ -70,7 +70,7 @@ void u32_table<A>::clear() {
|
|
|
70
70
|
|
|
71
71
|
template<typename A>
|
|
72
72
|
bool u32_table<A>::maybe_insert(uint32_t item) {
|
|
73
|
-
const
|
|
73
|
+
const uint32_t index = lookup(item);
|
|
74
74
|
if (slots[index] == item) return false;
|
|
75
75
|
if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert");
|
|
76
76
|
slots[index] = item;
|
|
@@ -83,7 +83,7 @@ bool u32_table<A>::maybe_insert(uint32_t item) {
|
|
|
83
83
|
|
|
84
84
|
template<typename A>
|
|
85
85
|
bool u32_table<A>::maybe_delete(uint32_t item) {
|
|
86
|
-
const
|
|
86
|
+
const uint32_t index = lookup(item);
|
|
87
87
|
if (slots[index] == UINT32_MAX) return false;
|
|
88
88
|
if (slots[index] != item) throw std::logic_error("item does not exist");
|
|
89
89
|
if (num_items == 0) throw std::logic_error("delete error");
|
|
@@ -110,7 +110,7 @@ bool u32_table<A>::maybe_delete(uint32_t item) {
|
|
|
110
110
|
|
|
111
111
|
// this one is specifically tailored to be a part of fm85 decompression scheme
|
|
112
112
|
template<typename A>
|
|
113
|
-
u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs,
|
|
113
|
+
u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs, uint32_t num_pairs, uint8_t lg_k, const A& allocator) {
|
|
114
114
|
uint8_t lg_num_slots = 2;
|
|
115
115
|
while (U32_TABLE_UPSIZE_DENOM * num_pairs > U32_TABLE_UPSIZE_NUMER * (1 << lg_num_slots)) lg_num_slots++;
|
|
116
116
|
u32_table<A> table(lg_num_slots, 6 + lg_k, allocator);
|
|
@@ -124,11 +124,11 @@ u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs, size_t num_pai
|
|
|
124
124
|
}
|
|
125
125
|
|
|
126
126
|
template<typename A>
|
|
127
|
-
|
|
128
|
-
const
|
|
129
|
-
const
|
|
127
|
+
uint32_t u32_table<A>::lookup(uint32_t item) const {
|
|
128
|
+
const uint32_t size = 1 << lg_size;
|
|
129
|
+
const uint32_t mask = size - 1;
|
|
130
130
|
const uint8_t shift = num_valid_bits - lg_size;
|
|
131
|
-
|
|
131
|
+
uint32_t probe = item >> shift;
|
|
132
132
|
if (probe > mask) throw std::logic_error("probe out of range");
|
|
133
133
|
while (slots[probe] != item && slots[probe] != UINT32_MAX) {
|
|
134
134
|
probe = (probe + 1) & mask;
|
|
@@ -139,7 +139,7 @@ size_t u32_table<A>::lookup(uint32_t item) const {
|
|
|
139
139
|
// counts and resizing must be handled by the caller
|
|
140
140
|
template<typename A>
|
|
141
141
|
void u32_table<A>::must_insert(uint32_t item) {
|
|
142
|
-
const
|
|
142
|
+
const uint32_t index = lookup(item);
|
|
143
143
|
if (slots[index] == item) throw std::logic_error("item exists");
|
|
144
144
|
if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert");
|
|
145
145
|
slots[index] = item;
|
|
@@ -148,13 +148,13 @@ void u32_table<A>::must_insert(uint32_t item) {
|
|
|
148
148
|
template<typename A>
|
|
149
149
|
void u32_table<A>::rebuild(uint8_t new_lg_size) {
|
|
150
150
|
if (new_lg_size < 2) throw std::logic_error("lg_size must be >= 2");
|
|
151
|
-
const
|
|
152
|
-
const
|
|
151
|
+
const uint32_t old_size = 1 << lg_size;
|
|
152
|
+
const uint32_t new_size = 1 << new_lg_size;
|
|
153
153
|
if (new_size <= num_items) throw std::logic_error("new_size <= num_items");
|
|
154
154
|
vector_u32<A> old_slots = std::move(slots);
|
|
155
155
|
slots = vector_u32<A>(new_size, UINT32_MAX, old_slots.get_allocator());
|
|
156
156
|
lg_size = new_lg_size;
|
|
157
|
-
for (
|
|
157
|
+
for (uint32_t i = 0; i < old_size; i++) {
|
|
158
158
|
if (old_slots[i] != UINT32_MAX) {
|
|
159
159
|
must_insert(old_slots[i]);
|
|
160
160
|
}
|
|
@@ -170,7 +170,7 @@ void u32_table<A>::rebuild(uint8_t new_lg_size) {
|
|
|
170
170
|
template<typename A>
|
|
171
171
|
vector_u32<A> u32_table<A>::unwrapping_get_items() const {
|
|
172
172
|
if (num_items == 0) return vector_u32<A>(slots.get_allocator());
|
|
173
|
-
const
|
|
173
|
+
const uint32_t table_size = 1 << lg_size;
|
|
174
174
|
vector_u32<A> result(num_items, 0, slots.get_allocator());
|
|
175
175
|
size_t i = 0;
|
|
176
176
|
size_t l = 0;
|
|
@@ -27,38 +27,38 @@ namespace datasketches {
|
|
|
27
27
|
typedef u32_table<std::allocator<void>> table;
|
|
28
28
|
|
|
29
29
|
TEST_CASE("cpc sketch: compress and decompress pairs", "[cpc_sketch]") {
|
|
30
|
-
const
|
|
31
|
-
const
|
|
30
|
+
const size_t N = 200;
|
|
31
|
+
const size_t MAXWORDS = 1000;
|
|
32
32
|
|
|
33
33
|
HashState twoHashes;
|
|
34
34
|
uint32_t pairArray[N];
|
|
35
35
|
uint32_t pairArray2[N];
|
|
36
36
|
uint64_t value = 35538947; // some arbitrary starting value
|
|
37
37
|
const uint64_t golden64 = 0x9e3779b97f4a7c13ULL; // the golden ratio
|
|
38
|
-
for (
|
|
38
|
+
for (size_t i = 0; i < N; i++) {
|
|
39
39
|
MurmurHash3_x64_128(&value, sizeof(value), 0, twoHashes);
|
|
40
40
|
uint32_t rand = twoHashes.h1 & 0xffff;
|
|
41
41
|
pairArray[i] = rand;
|
|
42
42
|
value += golden64;
|
|
43
43
|
}
|
|
44
44
|
//table::knuth_shell_sort3(pairArray, 0, N - 1); // unsigned numerical sort
|
|
45
|
-
std::sort(pairArray,
|
|
45
|
+
std::sort(pairArray, pairArray + N);
|
|
46
46
|
uint32_t prev = UINT32_MAX;
|
|
47
|
-
|
|
48
|
-
for (
|
|
47
|
+
uint32_t nxt = 0;
|
|
48
|
+
for (size_t i = 0; i < N; i++) { // uniquify
|
|
49
49
|
if (pairArray[i] != prev) {
|
|
50
50
|
prev = pairArray[i];
|
|
51
51
|
pairArray[nxt++] = pairArray[i];
|
|
52
52
|
}
|
|
53
53
|
}
|
|
54
|
-
|
|
54
|
+
uint32_t numPairs = nxt;
|
|
55
55
|
|
|
56
56
|
uint32_t compressedWords[MAXWORDS];
|
|
57
57
|
|
|
58
|
-
for (
|
|
59
|
-
|
|
58
|
+
for (uint8_t numBaseBits = 0; numBaseBits <= 11; numBaseBits++) {
|
|
59
|
+
uint32_t numWordsWritten = get_compressor<std::allocator<void>>().low_level_compress_pairs(pairArray, numPairs, numBaseBits, compressedWords);
|
|
60
60
|
get_compressor<std::allocator<void>>().low_level_uncompress_pairs(pairArray2, numPairs, numBaseBits, compressedWords, numWordsWritten);
|
|
61
|
-
for (
|
|
61
|
+
for (size_t i = 0; i < numPairs; i++) {
|
|
62
62
|
REQUIRE(pairArray[i] == pairArray2[i]);
|
|
63
63
|
}
|
|
64
64
|
}
|
|
@@ -25,6 +25,7 @@
|
|
|
25
25
|
#include <catch.hpp>
|
|
26
26
|
|
|
27
27
|
#include "cpc_sketch.hpp"
|
|
28
|
+
#include "cpc_union.hpp"
|
|
28
29
|
#include "test_allocator.hpp"
|
|
29
30
|
|
|
30
31
|
namespace datasketches {
|
|
@@ -234,4 +235,20 @@ TEST_CASE("cpc sketch allocation: serialize deserialize sliding, bytes", "[cpc_s
|
|
|
234
235
|
REQUIRE(test_allocator_net_allocations == 0);
|
|
235
236
|
}
|
|
236
237
|
|
|
238
|
+
using cpc_union_test_alloc = cpc_union_alloc<test_allocator<uint8_t>>;
|
|
239
|
+
|
|
240
|
+
TEST_CASE("cpc sketch allocation: union") {
|
|
241
|
+
cpc_sketch_test_alloc s1(11, DEFAULT_SEED, 0);
|
|
242
|
+
s1.update(1);
|
|
243
|
+
|
|
244
|
+
cpc_sketch_test_alloc s2(11, DEFAULT_SEED, 0);
|
|
245
|
+
s2.update(2);
|
|
246
|
+
|
|
247
|
+
cpc_union_test_alloc u(11, DEFAULT_SEED, 0);
|
|
248
|
+
u.update(s1);
|
|
249
|
+
u.update(s2);
|
|
250
|
+
auto s3 = u.get_result();
|
|
251
|
+
REQUIRE_FALSE(s3.is_empty());
|
|
252
|
+
}
|
|
253
|
+
|
|
237
254
|
} /* namespace datasketches */
|
|
@@ -283,6 +283,26 @@ TEST_CASE("cpc sketch: serialize deserialize sliding, bytes", "[cpc_sketch]") {
|
|
|
283
283
|
REQUIRE(deserialized.validate());
|
|
284
284
|
}
|
|
285
285
|
|
|
286
|
+
TEST_CASE("cpc sketch: serialize deserialize sliding huge", "[cpc_sketch]") {
|
|
287
|
+
cpc_sketch sketch(26);
|
|
288
|
+
const int n = 10000000;
|
|
289
|
+
for (int i = 0; i < n; i++) sketch.update(i);
|
|
290
|
+
REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.001));
|
|
291
|
+
auto bytes = sketch.serialize();
|
|
292
|
+
cpc_sketch deserialized = cpc_sketch::deserialize(bytes.data(), bytes.size());
|
|
293
|
+
REQUIRE(deserialized.is_empty() == sketch.is_empty());
|
|
294
|
+
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
|
|
295
|
+
REQUIRE(deserialized.validate());
|
|
296
|
+
REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), 7), std::out_of_range);
|
|
297
|
+
REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), 15), std::out_of_range);
|
|
298
|
+
REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
|
|
299
|
+
|
|
300
|
+
// updating again with the same values should not change the sketch
|
|
301
|
+
for (int i = 0; i < n; i++) deserialized.update(i);
|
|
302
|
+
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
|
|
303
|
+
REQUIRE(deserialized.validate());
|
|
304
|
+
}
|
|
305
|
+
|
|
286
306
|
TEST_CASE("cpc sketch: copy", "[cpc_sketch]") {
|
|
287
307
|
cpc_sketch s1(11);
|
|
288
308
|
s1.update(1);
|
|
@@ -378,4 +398,9 @@ TEST_CASE("cpc sketch: update string equivalence", "[cpc_sketch]") {
|
|
|
378
398
|
REQUIRE(sketch.get_estimate() == Approx(1).margin(RELATIVE_ERROR_FOR_LG_K_11));
|
|
379
399
|
}
|
|
380
400
|
|
|
401
|
+
TEST_CASE("cpc sketch: max serialized size", "[cpc_sketch]") {
|
|
402
|
+
REQUIRE(cpc_sketch::get_max_serialized_size_bytes(4) == 24 + 40);
|
|
403
|
+
REQUIRE(cpc_sketch::get_max_serialized_size_bytes(26) == static_cast<size_t>((0.6 * (1 << 26)) + 40));
|
|
404
|
+
}
|
|
405
|
+
|
|
381
406
|
} /* namespace datasketches */
|
|
@@ -32,23 +32,13 @@ target_include_directories(fi
|
|
|
32
32
|
target_link_libraries(fi INTERFACE common)
|
|
33
33
|
target_compile_features(fi INTERFACE cxx_std_11)
|
|
34
34
|
|
|
35
|
-
set(fi_HEADERS "")
|
|
36
|
-
list(APPEND fi_HEADERS "include/frequent_items_sketch.hpp")
|
|
37
|
-
list(APPEND fi_HEADERS "include/frequent_items_sketch_impl.hpp")
|
|
38
|
-
list(APPEND fi_HEADERS "include/reverse_purge_hash_map.hpp")
|
|
39
|
-
list(APPEND fi_HEADERS "include/reverse_purge_hash_map_impl.hpp")
|
|
40
|
-
|
|
41
35
|
install(TARGETS fi
|
|
42
36
|
EXPORT ${PROJECT_NAME}
|
|
43
37
|
)
|
|
44
38
|
|
|
45
|
-
install(FILES
|
|
39
|
+
install(FILES
|
|
40
|
+
include/frequent_items_sketch.hpp
|
|
41
|
+
include/frequent_items_sketch_impl.hpp
|
|
42
|
+
include/reverse_purge_hash_map.hpp
|
|
43
|
+
include/reverse_purge_hash_map_impl.hpp
|
|
46
44
|
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
|
|
47
|
-
|
|
48
|
-
target_sources(fi
|
|
49
|
-
INTERFACE
|
|
50
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/frequent_items_sketch.hpp
|
|
51
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/frequent_items_sketch_impl.hpp
|
|
52
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/reverse_purge_hash_map.hpp
|
|
53
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/reverse_purge_hash_map_impl.hpp
|
|
54
|
-
)
|