datasketches 0.2.0 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE +40 -3
- data/NOTICE +1 -1
- data/README.md +7 -7
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
- data/vendor/datasketches-cpp/LICENSE +40 -3
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/pyproject.toml +4 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
- data/vendor/datasketches-cpp/python/README.md +50 -50
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +10 -7
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
- metadata +18 -7
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
|
@@ -110,14 +110,14 @@ public:
|
|
|
110
110
|
* @return the lower bound of the approximate Clopper-Pearson confidence interval for the
|
|
111
111
|
* unknown success probability.
|
|
112
112
|
*/
|
|
113
|
-
static inline double approximate_lower_bound_on_p(
|
|
113
|
+
static inline double approximate_lower_bound_on_p(uint64_t n, uint64_t k, double num_std_devs) {
|
|
114
114
|
check_inputs(n, k);
|
|
115
115
|
if (n == 0) { return 0.0; } // the coin was never flipped, so we know nothing
|
|
116
116
|
else if (k == 0) { return 0.0; }
|
|
117
117
|
else if (k == 1) { return (exact_lower_bound_on_p_k_eq_1(n, delta_of_num_stdevs(num_std_devs))); }
|
|
118
118
|
else if (k == n) { return (exact_lower_bound_on_p_k_eq_n(n, delta_of_num_stdevs(num_std_devs))); }
|
|
119
119
|
else {
|
|
120
|
-
double x = abramowitz_stegun_formula_26p5p22((n - k) + 1, k, (-1.0 * num_std_devs));
|
|
120
|
+
double x = abramowitz_stegun_formula_26p5p22((n - k) + 1.0, static_cast<double>(k), (-1.0 * num_std_devs));
|
|
121
121
|
return (1.0 - x); // which is p
|
|
122
122
|
}
|
|
123
123
|
}
|
|
@@ -145,18 +145,18 @@ public:
|
|
|
145
145
|
* @return the upper bound of the approximate Clopper-Pearson confidence interval for the
|
|
146
146
|
* unknown success probability.
|
|
147
147
|
*/
|
|
148
|
-
static inline double approximate_upper_bound_on_p(
|
|
148
|
+
static inline double approximate_upper_bound_on_p(uint64_t n, uint64_t k, double num_std_devs) {
|
|
149
149
|
check_inputs(n, k);
|
|
150
150
|
if (n == 0) { return 1.0; } // the coin was never flipped, so we know nothing
|
|
151
151
|
else if (k == n) { return 1.0; }
|
|
152
152
|
else if (k == (n - 1)) {
|
|
153
|
-
return (
|
|
153
|
+
return (exact_upper_bound_on_p_k_eq_minusone(n, delta_of_num_stdevs(num_std_devs)));
|
|
154
154
|
}
|
|
155
155
|
else if (k == 0) {
|
|
156
156
|
return (exact_upper_bound_on_p_k_eq_zero(n, delta_of_num_stdevs(num_std_devs)));
|
|
157
157
|
}
|
|
158
158
|
else {
|
|
159
|
-
double x = abramowitz_stegun_formula_26p5p22(n - k, k + 1, num_std_devs);
|
|
159
|
+
double x = abramowitz_stegun_formula_26p5p22(static_cast<double>(n - k), k + 1.0, num_std_devs);
|
|
160
160
|
return (1.0 - x); // which is p
|
|
161
161
|
}
|
|
162
162
|
}
|
|
@@ -167,7 +167,7 @@ public:
|
|
|
167
167
|
* @param k is the number of successes. Must be non-negative, and cannot exceed n.
|
|
168
168
|
* @return the estimate of the unknown binomial proportion.
|
|
169
169
|
*/
|
|
170
|
-
static inline double estimate_unknown_p(
|
|
170
|
+
static inline double estimate_unknown_p(uint64_t n, uint64_t k) {
|
|
171
171
|
check_inputs(n, k);
|
|
172
172
|
if (n == 0) { return 0.5; } // the coin was never flipped, so we know nothing
|
|
173
173
|
else { return ((double) k / (double) n); }
|
|
@@ -193,9 +193,7 @@ public:
|
|
|
193
193
|
}
|
|
194
194
|
|
|
195
195
|
private:
|
|
196
|
-
static inline void check_inputs(
|
|
197
|
-
if (n < 0) { throw std::invalid_argument("N must be non-negative"); }
|
|
198
|
-
if (k < 0) { throw std::invalid_argument("K must be non-negative"); }
|
|
196
|
+
static inline void check_inputs(uint64_t n, uint64_t k) {
|
|
199
197
|
if (k > n) { throw std::invalid_argument("K cannot exceed N"); }
|
|
200
198
|
}
|
|
201
199
|
|
|
@@ -251,8 +249,7 @@ private:
|
|
|
251
249
|
// and it is worth keeping it that way so that it will always be easy to verify
|
|
252
250
|
// that the formula was typed in correctly.
|
|
253
251
|
|
|
254
|
-
static inline double abramowitz_stegun_formula_26p5p22(double a, double b,
|
|
255
|
-
double yp) {
|
|
252
|
+
static inline double abramowitz_stegun_formula_26p5p22(double a, double b, double yp) {
|
|
256
253
|
const double b2m1 = (2.0 * b) - 1.0;
|
|
257
254
|
const double a2m1 = (2.0 * a) - 1.0;
|
|
258
255
|
const double lambda = ((yp * yp) - 3.0) / 6.0;
|
|
@@ -268,19 +265,19 @@ private:
|
|
|
268
265
|
|
|
269
266
|
// Formulas for some special cases.
|
|
270
267
|
|
|
271
|
-
static inline double exact_upper_bound_on_p_k_eq_zero(
|
|
268
|
+
static inline double exact_upper_bound_on_p_k_eq_zero(uint64_t n, double delta) {
|
|
272
269
|
return (1.0 - pow(delta, (1.0 / n)));
|
|
273
270
|
}
|
|
274
271
|
|
|
275
|
-
static inline double exact_lower_bound_on_p_k_eq_n(
|
|
272
|
+
static inline double exact_lower_bound_on_p_k_eq_n(uint64_t n, double delta) {
|
|
276
273
|
return (pow(delta, (1.0 / n)));
|
|
277
274
|
}
|
|
278
275
|
|
|
279
|
-
static inline double exact_lower_bound_on_p_k_eq_1(
|
|
276
|
+
static inline double exact_lower_bound_on_p_k_eq_1(uint64_t n, double delta) {
|
|
280
277
|
return (1.0 - pow((1.0 - delta), (1.0 / n)));
|
|
281
278
|
}
|
|
282
279
|
|
|
283
|
-
static inline double
|
|
280
|
+
static inline double exact_upper_bound_on_p_k_eq_minusone(uint64_t n, double delta) {
|
|
284
281
|
return (pow((1.0 - delta), (1.0 / n)));
|
|
285
282
|
}
|
|
286
283
|
|
|
@@ -23,11 +23,14 @@
|
|
|
23
23
|
#include <cstdint>
|
|
24
24
|
#include <string>
|
|
25
25
|
#include <memory>
|
|
26
|
+
#include <iostream>
|
|
26
27
|
|
|
27
28
|
namespace datasketches {
|
|
28
29
|
|
|
29
30
|
static const uint64_t DEFAULT_SEED = 9001;
|
|
30
31
|
|
|
32
|
+
enum resize_factor { X1 = 0, X2, X4, X8 };
|
|
33
|
+
|
|
31
34
|
template<typename A> using AllocChar = typename std::allocator_traits<A>::template rebind_alloc<char>;
|
|
32
35
|
template<typename A> using string = std::basic_string<char, std::char_traits<char>, AllocChar<A>>;
|
|
33
36
|
|
|
@@ -46,6 +49,29 @@ constexpr uint8_t lg_size_from_count(uint32_t n, double load_factor) {
|
|
|
46
49
|
return log2(n) + ((n > static_cast<uint32_t>((1 << (log2(n) + 1)) * load_factor)) ? 2 : 1);
|
|
47
50
|
}
|
|
48
51
|
|
|
52
|
+
// stream helpers to hide casts
|
|
53
|
+
template<typename T>
|
|
54
|
+
static inline T read(std::istream& is) {
|
|
55
|
+
T value;
|
|
56
|
+
is.read(reinterpret_cast<char*>(&value), sizeof(T));
|
|
57
|
+
return value;
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
template<typename T>
|
|
61
|
+
static inline void read(std::istream& is, T* ptr, size_t size_bytes) {
|
|
62
|
+
is.read(reinterpret_cast<char*>(ptr), size_bytes);
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
template<typename T>
|
|
66
|
+
static inline void write(std::ostream& os, T& value) {
|
|
67
|
+
os.write(reinterpret_cast<const char*>(&value), sizeof(T));
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
template<typename T>
|
|
71
|
+
static inline void write(std::ostream& os, const T* ptr, size_t size_bytes) {
|
|
72
|
+
os.write(reinterpret_cast<const char*>(ptr), size_bytes);
|
|
73
|
+
}
|
|
74
|
+
|
|
49
75
|
} // namespace
|
|
50
76
|
|
|
51
77
|
#endif // _COMMON_DEFS_HPP_
|
|
@@ -38,29 +38,41 @@ fwd_type<T1, T2> conditional_forward(T2&& value) {
|
|
|
38
38
|
// Forward container as iterators
|
|
39
39
|
|
|
40
40
|
template<typename Container>
|
|
41
|
-
auto forward_begin(Container&& c) ->
|
|
42
|
-
|
|
41
|
+
auto forward_begin(Container&& c) -> typename std::enable_if<
|
|
42
|
+
std::is_lvalue_reference<Container>::value ||
|
|
43
|
+
std::is_same<typename std::remove_reference<Container>::type::const_iterator, decltype(c.begin())>::value,
|
|
44
|
+
decltype(c.begin())
|
|
45
|
+
>::type
|
|
43
46
|
{
|
|
44
47
|
return c.begin();
|
|
45
48
|
}
|
|
46
49
|
|
|
47
50
|
template<typename Container>
|
|
48
|
-
auto forward_begin(Container&& c) ->
|
|
49
|
-
|
|
51
|
+
auto forward_begin(Container&& c) -> typename std::enable_if<
|
|
52
|
+
!std::is_lvalue_reference<Container>::value &&
|
|
53
|
+
!std::is_same<typename std::remove_reference<Container>::type::const_iterator, decltype(c.begin())>::value,
|
|
54
|
+
decltype(std::make_move_iterator(c.begin()))
|
|
55
|
+
>::type
|
|
50
56
|
{
|
|
51
57
|
return std::make_move_iterator(c.begin());
|
|
52
58
|
}
|
|
53
59
|
|
|
54
60
|
template<typename Container>
|
|
55
|
-
auto forward_end(Container&& c) ->
|
|
56
|
-
|
|
61
|
+
auto forward_end(Container&& c) -> typename std::enable_if<
|
|
62
|
+
std::is_lvalue_reference<Container>::value ||
|
|
63
|
+
std::is_same<typename std::remove_reference<Container>::type::const_iterator, decltype(c.begin())>::value,
|
|
64
|
+
decltype(c.end())
|
|
65
|
+
>::type
|
|
57
66
|
{
|
|
58
67
|
return c.end();
|
|
59
68
|
}
|
|
60
69
|
|
|
61
70
|
template<typename Container>
|
|
62
|
-
auto forward_end(Container&& c) ->
|
|
63
|
-
|
|
71
|
+
auto forward_end(Container&& c) -> typename std::enable_if<
|
|
72
|
+
!std::is_lvalue_reference<Container>::value &&
|
|
73
|
+
!std::is_same<typename std::remove_reference<Container>::type::const_iterator, decltype(c.begin())>::value,
|
|
74
|
+
decltype(std::make_move_iterator(c.end()))
|
|
75
|
+
>::type
|
|
64
76
|
{
|
|
65
77
|
return std::make_move_iterator(c.end());
|
|
66
78
|
}
|
|
@@ -94,7 +94,7 @@ static inline uint8_t count_leading_zeros_in_u64(uint64_t input) {
|
|
|
94
94
|
static inline uint8_t count_trailing_zeros_in_u32(uint32_t input) {
|
|
95
95
|
for (int i = 0; i < 4; i++) {
|
|
96
96
|
const int byte = input & 0xff;
|
|
97
|
-
if (byte != 0) return (i << 3) + byte_trailing_zeros_table[byte];
|
|
97
|
+
if (byte != 0) return static_cast<uint8_t>((i << 3) + byte_trailing_zeros_table[byte]);
|
|
98
98
|
input >>= 8;
|
|
99
99
|
}
|
|
100
100
|
return 32;
|
|
@@ -103,7 +103,7 @@ static inline uint8_t count_trailing_zeros_in_u32(uint32_t input) {
|
|
|
103
103
|
static inline uint8_t count_trailing_zeros_in_u64(uint64_t input) {
|
|
104
104
|
for (int i = 0; i < 8; i++) {
|
|
105
105
|
const int byte = input & 0xff;
|
|
106
|
-
if (byte != 0) return (i << 3) + byte_trailing_zeros_table[byte];
|
|
106
|
+
if (byte != 0) return static_cast<uint8_t>((i << 3) + byte_trailing_zeros_table[byte]);
|
|
107
107
|
input >>= 8;
|
|
108
108
|
}
|
|
109
109
|
return 64;
|
|
@@ -51,7 +51,7 @@ struct serde<T, typename std::enable_if<std::is_arithmetic<T>::value>::type> {
|
|
|
51
51
|
bool failure = false;
|
|
52
52
|
try {
|
|
53
53
|
os.write(reinterpret_cast<const char*>(items), sizeof(T) * num);
|
|
54
|
-
} catch (std::ostream::failure&
|
|
54
|
+
} catch (std::ostream::failure&) {
|
|
55
55
|
failure = true;
|
|
56
56
|
}
|
|
57
57
|
if (failure || !os.good()) {
|
|
@@ -62,7 +62,7 @@ struct serde<T, typename std::enable_if<std::is_arithmetic<T>::value>::type> {
|
|
|
62
62
|
bool failure = false;
|
|
63
63
|
try {
|
|
64
64
|
is.read((char*)items, sizeof(T) * num);
|
|
65
|
-
} catch (std::istream::failure&
|
|
65
|
+
} catch (std::istream::failure&) {
|
|
66
66
|
failure = true;
|
|
67
67
|
}
|
|
68
68
|
if (failure || !is.good()) {
|
|
@@ -99,11 +99,11 @@ struct serde<std::string> {
|
|
|
99
99
|
bool failure = false;
|
|
100
100
|
try {
|
|
101
101
|
for (; i < num && os.good(); i++) {
|
|
102
|
-
uint32_t length = items[i].size();
|
|
102
|
+
uint32_t length = static_cast<uint32_t>(items[i].size());
|
|
103
103
|
os.write((char*)&length, sizeof(length));
|
|
104
104
|
os.write(items[i].c_str(), length);
|
|
105
105
|
}
|
|
106
|
-
} catch (std::ostream::failure&
|
|
106
|
+
} catch (std::ostream::failure&) {
|
|
107
107
|
failure = true;
|
|
108
108
|
}
|
|
109
109
|
if (failure || !os.good()) {
|
|
@@ -121,12 +121,12 @@ struct serde<std::string> {
|
|
|
121
121
|
std::string str;
|
|
122
122
|
str.reserve(length);
|
|
123
123
|
for (uint32_t j = 0; j < length; j++) {
|
|
124
|
-
str.push_back(is.get());
|
|
124
|
+
str.push_back(static_cast<char>(is.get()));
|
|
125
125
|
}
|
|
126
126
|
if (!is.good()) { break; }
|
|
127
127
|
new (&items[i]) std::string(std::move(str));
|
|
128
128
|
}
|
|
129
|
-
} catch (std::istream::failure&
|
|
129
|
+
} catch (std::istream::failure&) {
|
|
130
130
|
failure = true;
|
|
131
131
|
}
|
|
132
132
|
if (failure || !is.good()) {
|
|
@@ -143,7 +143,7 @@ struct serde<std::string> {
|
|
|
143
143
|
size_t serialize(void* ptr, size_t capacity, const std::string* items, unsigned num) const {
|
|
144
144
|
size_t bytes_written = 0;
|
|
145
145
|
for (unsigned i = 0; i < num; ++i) {
|
|
146
|
-
const uint32_t length = items[i].size();
|
|
146
|
+
const uint32_t length = static_cast<uint32_t>(items[i].size());
|
|
147
147
|
const size_t new_bytes = length + sizeof(length);
|
|
148
148
|
check_memory_size(bytes_written + new_bytes, capacity);
|
|
149
149
|
memcpy(ptr, &length, sizeof(length));
|
|
@@ -32,43 +32,23 @@ target_include_directories(cpc
|
|
|
32
32
|
target_link_libraries(cpc INTERFACE common)
|
|
33
33
|
target_compile_features(cpc INTERFACE cxx_std_11)
|
|
34
34
|
|
|
35
|
-
set(cpc_HEADERS "")
|
|
36
|
-
list(APPEND cpc_HEADERS "include/compression_data.hpp")
|
|
37
|
-
list(APPEND cpc_HEADERS "include/cpc_common.hpp")
|
|
38
|
-
list(APPEND cpc_HEADERS "include/cpc_compressor.hpp")
|
|
39
|
-
list(APPEND cpc_HEADERS "include/cpc_compressor_impl.hpp")
|
|
40
|
-
list(APPEND cpc_HEADERS "include/cpc_confidence.hpp")
|
|
41
|
-
list(APPEND cpc_HEADERS "include/cpc_sketch.hpp")
|
|
42
|
-
list(APPEND cpc_HEADERS "include/cpc_sketch_impl.hpp")
|
|
43
|
-
list(APPEND cpc_HEADERS "include/cpc_union.hpp")
|
|
44
|
-
list(APPEND cpc_HEADERS "include/cpc_union_impl.hpp")
|
|
45
|
-
list(APPEND cpc_HEADERS "include/cpc_util.hpp")
|
|
46
|
-
list(APPEND cpc_HEADERS "include/icon_estimator.hpp")
|
|
47
|
-
list(APPEND cpc_HEADERS "include/kxp_byte_lookup.hpp")
|
|
48
|
-
list(APPEND cpc_HEADERS "include/u32_table.hpp")
|
|
49
|
-
list(APPEND cpc_HEADERS "include/u32_table_impl.hpp")
|
|
50
|
-
|
|
51
35
|
install(TARGETS cpc
|
|
52
36
|
EXPORT ${PROJECT_NAME}
|
|
53
37
|
)
|
|
54
38
|
|
|
55
|
-
install(FILES
|
|
39
|
+
install(FILES
|
|
40
|
+
include/compression_data.hpp
|
|
41
|
+
include/cpc_common.hpp
|
|
42
|
+
include/cpc_compressor.hpp
|
|
43
|
+
include/cpc_compressor_impl.hpp
|
|
44
|
+
include/cpc_confidence.hpp
|
|
45
|
+
include/cpc_sketch.hpp
|
|
46
|
+
include/cpc_sketch_impl.hpp
|
|
47
|
+
include/cpc_union.hpp
|
|
48
|
+
include/cpc_union_impl.hpp
|
|
49
|
+
include/cpc_util.hpp
|
|
50
|
+
include/icon_estimator.hpp
|
|
51
|
+
include/kxp_byte_lookup.hpp
|
|
52
|
+
include/u32_table.hpp
|
|
53
|
+
include/u32_table_impl.hpp
|
|
56
54
|
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
|
|
57
|
-
|
|
58
|
-
target_sources(cpc
|
|
59
|
-
INTERFACE
|
|
60
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/compression_data.hpp
|
|
61
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/cpc_common.hpp
|
|
62
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/cpc_compressor.hpp
|
|
63
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/cpc_compressor_impl.hpp
|
|
64
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/cpc_confidence.hpp
|
|
65
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/cpc_sketch.hpp
|
|
66
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/cpc_sketch_impl.hpp
|
|
67
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/cpc_union.hpp
|
|
68
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/cpc_union_impl.hpp
|
|
69
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/cpc_util.hpp
|
|
70
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/icon_estimator.hpp
|
|
71
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/kxp_byte_lookup.hpp
|
|
72
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/u32_table.hpp
|
|
73
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/u32_table_impl.hpp
|
|
74
|
-
)
|
|
@@ -26,9 +26,16 @@
|
|
|
26
26
|
|
|
27
27
|
namespace datasketches {
|
|
28
28
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
29
|
+
namespace cpc_constants {
|
|
30
|
+
const uint8_t MIN_LG_K = 4;
|
|
31
|
+
const uint8_t MAX_LG_K = 26;
|
|
32
|
+
const uint8_t DEFAULT_LG_K = 11;
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
// TODO: Redundant and deprecated. Will be removed in next major version release.
|
|
36
|
+
static const uint8_t CPC_MIN_LG_K = cpc_constants::MIN_LG_K;
|
|
37
|
+
static const uint8_t CPC_MAX_LG_K = cpc_constants::MAX_LG_K;
|
|
38
|
+
static const uint8_t CPC_DEFAULT_LG_K = cpc_constants::DEFAULT_LG_K;
|
|
32
39
|
|
|
33
40
|
template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
|
|
34
41
|
template<typename A> using AllocU16 = typename std::allocator_traits<A>::template rebind_alloc<uint16_t>;
|
|
@@ -48,44 +48,44 @@ template<typename A>
|
|
|
48
48
|
class cpc_compressor {
|
|
49
49
|
public:
|
|
50
50
|
void compress(const cpc_sketch_alloc<A>& source, compressed_state<A>& target) const;
|
|
51
|
-
void uncompress(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k,
|
|
51
|
+
void uncompress(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint32_t num_coupons) const;
|
|
52
52
|
|
|
53
53
|
// methods below are public for testing
|
|
54
54
|
|
|
55
55
|
// This returns the number of compressed words that were actually used. It is the caller's
|
|
56
56
|
// responsibility to ensure that the compressed_words array is long enough to prevent over-run.
|
|
57
|
-
|
|
57
|
+
uint32_t low_level_compress_bytes(
|
|
58
58
|
const uint8_t* byte_array, // input
|
|
59
|
-
|
|
59
|
+
uint32_t num_bytes_to_encode,
|
|
60
60
|
const uint16_t* encoding_table,
|
|
61
61
|
uint32_t* compressed_words // output
|
|
62
62
|
) const;
|
|
63
63
|
|
|
64
64
|
void low_level_uncompress_bytes(
|
|
65
65
|
uint8_t* byte_array, // output
|
|
66
|
-
|
|
66
|
+
uint32_t num_bytes_to_decode,
|
|
67
67
|
const uint16_t* decoding_table,
|
|
68
68
|
const uint32_t* compressed_words,
|
|
69
|
-
|
|
69
|
+
uint32_t num_compressed_words // input
|
|
70
70
|
) const;
|
|
71
71
|
|
|
72
72
|
// Here "pairs" refers to row-column pairs that specify
|
|
73
73
|
// the positions of surprising values in the bit matrix.
|
|
74
74
|
|
|
75
75
|
// returns the number of compressedWords actually used
|
|
76
|
-
|
|
76
|
+
uint32_t low_level_compress_pairs(
|
|
77
77
|
const uint32_t* pair_array, // input
|
|
78
|
-
|
|
79
|
-
|
|
78
|
+
uint32_t num_pairs_to_encode,
|
|
79
|
+
uint8_t num_base_bits,
|
|
80
80
|
uint32_t* compressed_words // output
|
|
81
81
|
) const;
|
|
82
82
|
|
|
83
83
|
void low_level_uncompress_pairs(
|
|
84
84
|
uint32_t* pair_array, // output
|
|
85
|
-
|
|
86
|
-
|
|
85
|
+
uint32_t num_pairs_to_decode,
|
|
86
|
+
uint8_t num_base_bits,
|
|
87
87
|
const uint32_t* compressed_words, // input
|
|
88
|
-
|
|
88
|
+
uint32_t num_compressed_words // input
|
|
89
89
|
) const;
|
|
90
90
|
|
|
91
91
|
private:
|
|
@@ -122,22 +122,22 @@ private:
|
|
|
122
122
|
void uncompress_pinned_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint32_t num_coupons) const;
|
|
123
123
|
void uncompress_sliding_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint32_t num_coupons) const;
|
|
124
124
|
|
|
125
|
-
uint8_t* make_inverse_permutation(const uint8_t* permu,
|
|
126
|
-
uint16_t* make_decoding_table(const uint16_t* encoding_table,
|
|
125
|
+
uint8_t* make_inverse_permutation(const uint8_t* permu, unsigned length);
|
|
126
|
+
uint16_t* make_decoding_table(const uint16_t* encoding_table, unsigned num_byte_values);
|
|
127
127
|
void validate_decoding_table(const uint16_t* decoding_table, const uint16_t* encoding_table) const;
|
|
128
128
|
|
|
129
129
|
void compress_surprising_values(const vector_u32<A>& pairs, uint8_t lg_k, compressed_state<A>& result) const;
|
|
130
130
|
void compress_sliding_window(const uint8_t* window, uint8_t lg_k, uint32_t num_coupons, compressed_state<A>& target) const;
|
|
131
131
|
|
|
132
|
-
vector_u32<A> uncompress_surprising_values(const uint32_t* data,
|
|
133
|
-
void uncompress_sliding_window(const uint32_t* data,
|
|
132
|
+
vector_u32<A> uncompress_surprising_values(const uint32_t* data, uint32_t data_words, uint32_t num_pairs, uint8_t lg_k, const A& allocator) const;
|
|
133
|
+
void uncompress_sliding_window(const uint32_t* data, uint32_t data_words, vector_u8<A>& window, uint8_t lg_k, uint32_t num_coupons) const;
|
|
134
134
|
|
|
135
|
-
static size_t safe_length_for_compressed_pair_buf(
|
|
136
|
-
static size_t safe_length_for_compressed_window_buf(
|
|
137
|
-
static uint8_t determine_pseudo_phase(uint8_t lg_k,
|
|
135
|
+
static size_t safe_length_for_compressed_pair_buf(uint32_t k, uint32_t num_pairs, uint8_t num_base_bits);
|
|
136
|
+
static size_t safe_length_for_compressed_window_buf(uint32_t k);
|
|
137
|
+
static uint8_t determine_pseudo_phase(uint8_t lg_k, uint32_t c);
|
|
138
138
|
|
|
139
139
|
static inline vector_u32<A> tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space, const A& allocator);
|
|
140
|
-
static inline
|
|
140
|
+
static inline uint8_t golomb_choose_number_of_base_bits(uint32_t k, uint64_t count);
|
|
141
141
|
};
|
|
142
142
|
|
|
143
143
|
} /* namespace datasketches */
|