datasketches 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
- metadata +8 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 5c578044053c564421893cc4433f7fe557f23ba9d8a1995fc2a2c5f07742721a
|
|
4
|
+
data.tar.gz: f4122bd75e19fede015b01a5e5ad8e6130f75babe9c9160cc56f378480a16cee
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2d7c4d7306f28356557a816a78033b909561ccd8f843281a2b756e88cbdcb9936da7995ff80871a19e229675ead812aca00d6c639d63a6532998c3c1b35aa953
|
|
7
|
+
data.tar.gz: fdf0fe1d14e04bfddef9df1ae7958f6571a7f689865aa02e81713d4b250afeeeb8c90a168ce855728d9c831aff6d3ea71df91c71b7269a760c19488c42c92658
|
data/CHANGELOG.md
CHANGED
data/lib/datasketches/version.rb
CHANGED
|
@@ -70,6 +70,13 @@ if(COVERAGE AND CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
|
|
|
70
70
|
add_link_options(--coverage)
|
|
71
71
|
endif()
|
|
72
72
|
|
|
73
|
+
option(SANITIZE "Run sanitization checks (g++/clang only)" OFF)
|
|
74
|
+
if(SANITIZE AND CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
|
|
75
|
+
add_compile_options(-fsanitize=${SANITIZE})
|
|
76
|
+
add_link_options(-fsanitize=${SANITIZE})
|
|
77
|
+
endif()
|
|
78
|
+
|
|
79
|
+
|
|
73
80
|
# set default build type to Release
|
|
74
81
|
# Derived from: https://blog.kitware.com/cmake-and-the-default-build-type/
|
|
75
82
|
set(default_build_type "Release")
|
|
@@ -16,6 +16,8 @@
|
|
|
16
16
|
#ifndef _MURMURHASH3_H_
|
|
17
17
|
#define _MURMURHASH3_H_
|
|
18
18
|
|
|
19
|
+
#include <cstring>
|
|
20
|
+
|
|
19
21
|
//-----------------------------------------------------------------------------
|
|
20
22
|
// Platform-specific functions and macros
|
|
21
23
|
|
|
@@ -76,9 +78,11 @@ typedef struct {
|
|
|
76
78
|
// Block read - if your platform needs to do endian-swapping or can only
|
|
77
79
|
// handle aligned reads, do the conversion here
|
|
78
80
|
|
|
79
|
-
FORCE_INLINE uint64_t getblock64 ( const uint64_t * p,
|
|
81
|
+
FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, size_t i )
|
|
80
82
|
{
|
|
81
|
-
|
|
83
|
+
uint64_t res;
|
|
84
|
+
memcpy(&res, p + i, sizeof(res));
|
|
85
|
+
return res;
|
|
82
86
|
}
|
|
83
87
|
|
|
84
88
|
//-----------------------------------------------------------------------------
|
|
@@ -95,7 +99,7 @@ FORCE_INLINE uint64_t fmix64 ( uint64_t k )
|
|
|
95
99
|
return k;
|
|
96
100
|
}
|
|
97
101
|
|
|
98
|
-
FORCE_INLINE void MurmurHash3_x64_128(const void* key,
|
|
102
|
+
FORCE_INLINE void MurmurHash3_x64_128(const void* key, size_t lenBytes, uint64_t seed, HashState& out) {
|
|
99
103
|
static const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
|
|
100
104
|
static const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
|
|
101
105
|
|
|
@@ -106,13 +110,13 @@ FORCE_INLINE void MurmurHash3_x64_128(const void* key, int lenBytes, uint64_t se
|
|
|
106
110
|
|
|
107
111
|
// Number of full 128-bit blocks of 16 bytes.
|
|
108
112
|
// Possible exclusion of a remainder of up to 15 bytes.
|
|
109
|
-
const
|
|
113
|
+
const size_t nblocks = lenBytes >> 4; // bytes / 16
|
|
110
114
|
|
|
111
115
|
// Process the 128-bit blocks (the body) into the hash
|
|
112
116
|
const uint64_t* blocks = (const uint64_t*)(data);
|
|
113
|
-
for (
|
|
114
|
-
uint64_t k1 = getblock64(blocks,i*2+0);
|
|
115
|
-
uint64_t k2 = getblock64(blocks,i*2+1);
|
|
117
|
+
for (size_t i = 0; i < nblocks; ++i) { // 16 bytes per block
|
|
118
|
+
uint64_t k1 = getblock64(blocks, i * 2 + 0);
|
|
119
|
+
uint64_t k2 = getblock64(blocks, i * 2 + 1);
|
|
116
120
|
|
|
117
121
|
k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; out.h1 ^= k1;
|
|
118
122
|
out.h1 = ROTL64(out.h1,27);
|
|
@@ -381,7 +381,7 @@ private:
|
|
|
381
381
|
// The following computes an approximation to the lower bound of a Frequentist
|
|
382
382
|
// confidence interval based on the tails of the Binomial distribution.
|
|
383
383
|
static double compute_approx_binomial_lower_bound(unsigned long long num_samples, double theta, unsigned num_std_devs) {
|
|
384
|
-
if (theta == 1) return num_samples;
|
|
384
|
+
if (theta == 1) return static_cast<double>(num_samples);
|
|
385
385
|
if (num_samples == 0) return 0;
|
|
386
386
|
if (num_samples == 1) {
|
|
387
387
|
const double delta = delta_of_num_std_devs[num_std_devs];
|
|
@@ -395,24 +395,24 @@ private:
|
|
|
395
395
|
}
|
|
396
396
|
// at this point we know 2 <= num_samples <= 120
|
|
397
397
|
if (theta > (1 - 1e-5)) { // empirically-determined threshold
|
|
398
|
-
return num_samples;
|
|
398
|
+
return static_cast<double>(num_samples);
|
|
399
399
|
}
|
|
400
400
|
if (theta < (num_samples / 360.0)) { // empirically-determined threshold
|
|
401
401
|
// here we use the Gaussian approximation, but with a modified num_std_devs
|
|
402
|
-
const unsigned index = 3 * num_samples + (num_std_devs - 1);
|
|
402
|
+
const unsigned index = 3 * static_cast<unsigned>(num_samples) + (num_std_devs - 1);
|
|
403
403
|
const double raw_lb = cont_classic_lb(num_samples, theta, lb_equiv_table[index]);
|
|
404
404
|
return raw_lb - 0.5; // fake round down
|
|
405
405
|
}
|
|
406
406
|
// This is the most difficult range to approximate; we will compute an "exact" LB.
|
|
407
407
|
// We know that est <= 360, so specialNStar() shouldn't be ridiculously slow.
|
|
408
408
|
const double delta = delta_of_num_std_devs[num_std_devs];
|
|
409
|
-
return special_n_star(num_samples, theta, delta); // no need to round
|
|
409
|
+
return static_cast<double>(special_n_star(num_samples, theta, delta)); // no need to round
|
|
410
410
|
}
|
|
411
411
|
|
|
412
412
|
// The following computes an approximation to the upper bound of a Frequentist
|
|
413
413
|
// confidence interval based on the tails of the Binomial distribution.
|
|
414
414
|
static double compute_approx_binomial_upper_bound(unsigned long long num_samples, double theta, unsigned num_std_devs) {
|
|
415
|
-
if (theta == 1) return num_samples;
|
|
415
|
+
if (theta == 1) return static_cast<double>(num_samples);
|
|
416
416
|
if (num_samples == 0) {
|
|
417
417
|
const double delta = delta_of_num_std_devs[num_std_devs];
|
|
418
418
|
const double raw_ub = std::log(delta) / std::log(1 - theta);
|
|
@@ -425,18 +425,18 @@ private:
|
|
|
425
425
|
}
|
|
426
426
|
// at this point we know 2 <= num_samples <= 120
|
|
427
427
|
if (theta > (1 - 1e-5)) { // empirically-determined threshold
|
|
428
|
-
return num_samples + 1;
|
|
428
|
+
return static_cast<double>(num_samples + 1);
|
|
429
429
|
}
|
|
430
430
|
if (theta < (num_samples / 360.0)) { // empirically-determined threshold
|
|
431
431
|
// here we use the Gaussian approximation, but with a modified num_std_devs
|
|
432
|
-
const unsigned index = 3 * num_samples + (num_std_devs - 1);
|
|
432
|
+
const unsigned index = 3 * static_cast<unsigned>(num_samples) + (num_std_devs - 1);
|
|
433
433
|
const double raw_ub = cont_classic_ub(num_samples, theta, ub_equiv_table[index]);
|
|
434
434
|
return raw_ub + 0.5; // fake round up
|
|
435
435
|
}
|
|
436
436
|
// This is the most difficult range to approximate; we will compute an "exact" UB.
|
|
437
437
|
// We know that est <= 360, so specialNPrimeF() shouldn't be ridiculously slow.
|
|
438
438
|
const double delta = delta_of_num_std_devs[num_std_devs];
|
|
439
|
-
return special_n_prime_f(num_samples, theta, delta); // no need to round
|
|
439
|
+
return static_cast<double>(special_n_prime_f(num_samples, theta, delta)); // no need to round
|
|
440
440
|
}
|
|
441
441
|
|
|
442
442
|
static void check_theta(double theta) {
|
|
@@ -110,14 +110,14 @@ public:
|
|
|
110
110
|
* @return the lower bound of the approximate Clopper-Pearson confidence interval for the
|
|
111
111
|
* unknown success probability.
|
|
112
112
|
*/
|
|
113
|
-
static inline double approximate_lower_bound_on_p(
|
|
113
|
+
static inline double approximate_lower_bound_on_p(uint64_t n, uint64_t k, double num_std_devs) {
|
|
114
114
|
check_inputs(n, k);
|
|
115
115
|
if (n == 0) { return 0.0; } // the coin was never flipped, so we know nothing
|
|
116
116
|
else if (k == 0) { return 0.0; }
|
|
117
117
|
else if (k == 1) { return (exact_lower_bound_on_p_k_eq_1(n, delta_of_num_stdevs(num_std_devs))); }
|
|
118
118
|
else if (k == n) { return (exact_lower_bound_on_p_k_eq_n(n, delta_of_num_stdevs(num_std_devs))); }
|
|
119
119
|
else {
|
|
120
|
-
double x = abramowitz_stegun_formula_26p5p22((n - k) + 1, k, (-1.0 * num_std_devs));
|
|
120
|
+
double x = abramowitz_stegun_formula_26p5p22((n - k) + 1.0, static_cast<double>(k), (-1.0 * num_std_devs));
|
|
121
121
|
return (1.0 - x); // which is p
|
|
122
122
|
}
|
|
123
123
|
}
|
|
@@ -145,18 +145,18 @@ public:
|
|
|
145
145
|
* @return the upper bound of the approximate Clopper-Pearson confidence interval for the
|
|
146
146
|
* unknown success probability.
|
|
147
147
|
*/
|
|
148
|
-
static inline double approximate_upper_bound_on_p(
|
|
148
|
+
static inline double approximate_upper_bound_on_p(uint64_t n, uint64_t k, double num_std_devs) {
|
|
149
149
|
check_inputs(n, k);
|
|
150
150
|
if (n == 0) { return 1.0; } // the coin was never flipped, so we know nothing
|
|
151
151
|
else if (k == n) { return 1.0; }
|
|
152
152
|
else if (k == (n - 1)) {
|
|
153
|
-
return (
|
|
153
|
+
return (exact_upper_bound_on_p_k_eq_minusone(n, delta_of_num_stdevs(num_std_devs)));
|
|
154
154
|
}
|
|
155
155
|
else if (k == 0) {
|
|
156
156
|
return (exact_upper_bound_on_p_k_eq_zero(n, delta_of_num_stdevs(num_std_devs)));
|
|
157
157
|
}
|
|
158
158
|
else {
|
|
159
|
-
double x = abramowitz_stegun_formula_26p5p22(n - k, k + 1, num_std_devs);
|
|
159
|
+
double x = abramowitz_stegun_formula_26p5p22(static_cast<double>(n - k), k + 1.0, num_std_devs);
|
|
160
160
|
return (1.0 - x); // which is p
|
|
161
161
|
}
|
|
162
162
|
}
|
|
@@ -167,7 +167,7 @@ public:
|
|
|
167
167
|
* @param k is the number of successes. Must be non-negative, and cannot exceed n.
|
|
168
168
|
* @return the estimate of the unknown binomial proportion.
|
|
169
169
|
*/
|
|
170
|
-
static inline double estimate_unknown_p(
|
|
170
|
+
static inline double estimate_unknown_p(uint64_t n, uint64_t k) {
|
|
171
171
|
check_inputs(n, k);
|
|
172
172
|
if (n == 0) { return 0.5; } // the coin was never flipped, so we know nothing
|
|
173
173
|
else { return ((double) k / (double) n); }
|
|
@@ -193,9 +193,7 @@ public:
|
|
|
193
193
|
}
|
|
194
194
|
|
|
195
195
|
private:
|
|
196
|
-
static inline void check_inputs(
|
|
197
|
-
if (n < 0) { throw std::invalid_argument("N must be non-negative"); }
|
|
198
|
-
if (k < 0) { throw std::invalid_argument("K must be non-negative"); }
|
|
196
|
+
static inline void check_inputs(uint64_t n, uint64_t k) {
|
|
199
197
|
if (k > n) { throw std::invalid_argument("K cannot exceed N"); }
|
|
200
198
|
}
|
|
201
199
|
|
|
@@ -251,8 +249,7 @@ private:
|
|
|
251
249
|
// and it is worth keeping it that way so that it will always be easy to verify
|
|
252
250
|
// that the formula was typed in correctly.
|
|
253
251
|
|
|
254
|
-
static inline double abramowitz_stegun_formula_26p5p22(double a, double b,
|
|
255
|
-
double yp) {
|
|
252
|
+
static inline double abramowitz_stegun_formula_26p5p22(double a, double b, double yp) {
|
|
256
253
|
const double b2m1 = (2.0 * b) - 1.0;
|
|
257
254
|
const double a2m1 = (2.0 * a) - 1.0;
|
|
258
255
|
const double lambda = ((yp * yp) - 3.0) / 6.0;
|
|
@@ -268,19 +265,19 @@ private:
|
|
|
268
265
|
|
|
269
266
|
// Formulas for some special cases.
|
|
270
267
|
|
|
271
|
-
static inline double exact_upper_bound_on_p_k_eq_zero(
|
|
268
|
+
static inline double exact_upper_bound_on_p_k_eq_zero(uint64_t n, double delta) {
|
|
272
269
|
return (1.0 - pow(delta, (1.0 / n)));
|
|
273
270
|
}
|
|
274
271
|
|
|
275
|
-
static inline double exact_lower_bound_on_p_k_eq_n(
|
|
272
|
+
static inline double exact_lower_bound_on_p_k_eq_n(uint64_t n, double delta) {
|
|
276
273
|
return (pow(delta, (1.0 / n)));
|
|
277
274
|
}
|
|
278
275
|
|
|
279
|
-
static inline double exact_lower_bound_on_p_k_eq_1(
|
|
276
|
+
static inline double exact_lower_bound_on_p_k_eq_1(uint64_t n, double delta) {
|
|
280
277
|
return (1.0 - pow((1.0 - delta), (1.0 / n)));
|
|
281
278
|
}
|
|
282
279
|
|
|
283
|
-
static inline double
|
|
280
|
+
static inline double exact_upper_bound_on_p_k_eq_minusone(uint64_t n, double delta) {
|
|
284
281
|
return (pow((1.0 - delta), (1.0 / n)));
|
|
285
282
|
}
|
|
286
283
|
|
|
@@ -23,6 +23,7 @@
|
|
|
23
23
|
#include <cstdint>
|
|
24
24
|
#include <string>
|
|
25
25
|
#include <memory>
|
|
26
|
+
#include <iostream>
|
|
26
27
|
|
|
27
28
|
namespace datasketches {
|
|
28
29
|
|
|
@@ -46,6 +47,29 @@ constexpr uint8_t lg_size_from_count(uint32_t n, double load_factor) {
|
|
|
46
47
|
return log2(n) + ((n > static_cast<uint32_t>((1 << (log2(n) + 1)) * load_factor)) ? 2 : 1);
|
|
47
48
|
}
|
|
48
49
|
|
|
50
|
+
// stream helpers to hide casts
|
|
51
|
+
template<typename T>
|
|
52
|
+
static inline T read(std::istream& is) {
|
|
53
|
+
T value;
|
|
54
|
+
is.read(reinterpret_cast<char*>(&value), sizeof(T));
|
|
55
|
+
return value;
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
template<typename T>
|
|
59
|
+
static inline void read(std::istream& is, T* ptr, size_t size_bytes) {
|
|
60
|
+
is.read(reinterpret_cast<char*>(ptr), size_bytes);
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
template<typename T>
|
|
64
|
+
static inline void write(std::ostream& os, T& value) {
|
|
65
|
+
os.write(reinterpret_cast<const char*>(&value), sizeof(T));
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
template<typename T>
|
|
69
|
+
static inline void write(std::ostream& os, const T* ptr, size_t size_bytes) {
|
|
70
|
+
os.write(reinterpret_cast<const char*>(ptr), size_bytes);
|
|
71
|
+
}
|
|
72
|
+
|
|
49
73
|
} // namespace
|
|
50
74
|
|
|
51
75
|
#endif // _COMMON_DEFS_HPP_
|
|
@@ -38,29 +38,41 @@ fwd_type<T1, T2> conditional_forward(T2&& value) {
|
|
|
38
38
|
// Forward container as iterators
|
|
39
39
|
|
|
40
40
|
template<typename Container>
|
|
41
|
-
auto forward_begin(Container&& c) ->
|
|
42
|
-
|
|
41
|
+
auto forward_begin(Container&& c) -> typename std::enable_if<
|
|
42
|
+
std::is_lvalue_reference<Container>::value ||
|
|
43
|
+
std::is_same<typename std::remove_reference<Container>::type::const_iterator, decltype(c.begin())>::value,
|
|
44
|
+
decltype(c.begin())
|
|
45
|
+
>::type
|
|
43
46
|
{
|
|
44
47
|
return c.begin();
|
|
45
48
|
}
|
|
46
49
|
|
|
47
50
|
template<typename Container>
|
|
48
|
-
auto forward_begin(Container&& c) ->
|
|
49
|
-
|
|
51
|
+
auto forward_begin(Container&& c) -> typename std::enable_if<
|
|
52
|
+
!std::is_lvalue_reference<Container>::value &&
|
|
53
|
+
!std::is_same<typename std::remove_reference<Container>::type::const_iterator, decltype(c.begin())>::value,
|
|
54
|
+
decltype(std::make_move_iterator(c.begin()))
|
|
55
|
+
>::type
|
|
50
56
|
{
|
|
51
57
|
return std::make_move_iterator(c.begin());
|
|
52
58
|
}
|
|
53
59
|
|
|
54
60
|
template<typename Container>
|
|
55
|
-
auto forward_end(Container&& c) ->
|
|
56
|
-
|
|
61
|
+
auto forward_end(Container&& c) -> typename std::enable_if<
|
|
62
|
+
std::is_lvalue_reference<Container>::value ||
|
|
63
|
+
std::is_same<typename std::remove_reference<Container>::type::const_iterator, decltype(c.begin())>::value,
|
|
64
|
+
decltype(c.end())
|
|
65
|
+
>::type
|
|
57
66
|
{
|
|
58
67
|
return c.end();
|
|
59
68
|
}
|
|
60
69
|
|
|
61
70
|
template<typename Container>
|
|
62
|
-
auto forward_end(Container&& c) ->
|
|
63
|
-
|
|
71
|
+
auto forward_end(Container&& c) -> typename std::enable_if<
|
|
72
|
+
!std::is_lvalue_reference<Container>::value &&
|
|
73
|
+
!std::is_same<typename std::remove_reference<Container>::type::const_iterator, decltype(c.begin())>::value,
|
|
74
|
+
decltype(std::make_move_iterator(c.end()))
|
|
75
|
+
>::type
|
|
64
76
|
{
|
|
65
77
|
return std::make_move_iterator(c.end());
|
|
66
78
|
}
|
|
@@ -94,7 +94,7 @@ static inline uint8_t count_leading_zeros_in_u64(uint64_t input) {
|
|
|
94
94
|
static inline uint8_t count_trailing_zeros_in_u32(uint32_t input) {
|
|
95
95
|
for (int i = 0; i < 4; i++) {
|
|
96
96
|
const int byte = input & 0xff;
|
|
97
|
-
if (byte != 0) return (i << 3) + byte_trailing_zeros_table[byte];
|
|
97
|
+
if (byte != 0) return static_cast<uint8_t>((i << 3) + byte_trailing_zeros_table[byte]);
|
|
98
98
|
input >>= 8;
|
|
99
99
|
}
|
|
100
100
|
return 32;
|
|
@@ -103,7 +103,7 @@ static inline uint8_t count_trailing_zeros_in_u32(uint32_t input) {
|
|
|
103
103
|
static inline uint8_t count_trailing_zeros_in_u64(uint64_t input) {
|
|
104
104
|
for (int i = 0; i < 8; i++) {
|
|
105
105
|
const int byte = input & 0xff;
|
|
106
|
-
if (byte != 0) return (i << 3) + byte_trailing_zeros_table[byte];
|
|
106
|
+
if (byte != 0) return static_cast<uint8_t>((i << 3) + byte_trailing_zeros_table[byte]);
|
|
107
107
|
input >>= 8;
|
|
108
108
|
}
|
|
109
109
|
return 64;
|
|
@@ -51,7 +51,7 @@ struct serde<T, typename std::enable_if<std::is_arithmetic<T>::value>::type> {
|
|
|
51
51
|
bool failure = false;
|
|
52
52
|
try {
|
|
53
53
|
os.write(reinterpret_cast<const char*>(items), sizeof(T) * num);
|
|
54
|
-
} catch (std::ostream::failure&
|
|
54
|
+
} catch (std::ostream::failure&) {
|
|
55
55
|
failure = true;
|
|
56
56
|
}
|
|
57
57
|
if (failure || !os.good()) {
|
|
@@ -62,7 +62,7 @@ struct serde<T, typename std::enable_if<std::is_arithmetic<T>::value>::type> {
|
|
|
62
62
|
bool failure = false;
|
|
63
63
|
try {
|
|
64
64
|
is.read((char*)items, sizeof(T) * num);
|
|
65
|
-
} catch (std::istream::failure&
|
|
65
|
+
} catch (std::istream::failure&) {
|
|
66
66
|
failure = true;
|
|
67
67
|
}
|
|
68
68
|
if (failure || !is.good()) {
|
|
@@ -99,11 +99,11 @@ struct serde<std::string> {
|
|
|
99
99
|
bool failure = false;
|
|
100
100
|
try {
|
|
101
101
|
for (; i < num && os.good(); i++) {
|
|
102
|
-
uint32_t length = items[i].size();
|
|
102
|
+
uint32_t length = static_cast<uint32_t>(items[i].size());
|
|
103
103
|
os.write((char*)&length, sizeof(length));
|
|
104
104
|
os.write(items[i].c_str(), length);
|
|
105
105
|
}
|
|
106
|
-
} catch (std::ostream::failure&
|
|
106
|
+
} catch (std::ostream::failure&) {
|
|
107
107
|
failure = true;
|
|
108
108
|
}
|
|
109
109
|
if (failure || !os.good()) {
|
|
@@ -121,12 +121,12 @@ struct serde<std::string> {
|
|
|
121
121
|
std::string str;
|
|
122
122
|
str.reserve(length);
|
|
123
123
|
for (uint32_t j = 0; j < length; j++) {
|
|
124
|
-
str.push_back(is.get());
|
|
124
|
+
str.push_back(static_cast<char>(is.get()));
|
|
125
125
|
}
|
|
126
126
|
if (!is.good()) { break; }
|
|
127
127
|
new (&items[i]) std::string(std::move(str));
|
|
128
128
|
}
|
|
129
|
-
} catch (std::istream::failure&
|
|
129
|
+
} catch (std::istream::failure&) {
|
|
130
130
|
failure = true;
|
|
131
131
|
}
|
|
132
132
|
if (failure || !is.good()) {
|
|
@@ -143,7 +143,7 @@ struct serde<std::string> {
|
|
|
143
143
|
size_t serialize(void* ptr, size_t capacity, const std::string* items, unsigned num) const {
|
|
144
144
|
size_t bytes_written = 0;
|
|
145
145
|
for (unsigned i = 0; i < num; ++i) {
|
|
146
|
-
const uint32_t length = items[i].size();
|
|
146
|
+
const uint32_t length = static_cast<uint32_t>(items[i].size());
|
|
147
147
|
const size_t new_bytes = length + sizeof(length);
|
|
148
148
|
check_memory_size(bytes_written + new_bytes, capacity);
|
|
149
149
|
memcpy(ptr, &length, sizeof(length));
|
|
@@ -48,44 +48,44 @@ template<typename A>
|
|
|
48
48
|
class cpc_compressor {
|
|
49
49
|
public:
|
|
50
50
|
void compress(const cpc_sketch_alloc<A>& source, compressed_state<A>& target) const;
|
|
51
|
-
void uncompress(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k,
|
|
51
|
+
void uncompress(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint32_t num_coupons) const;
|
|
52
52
|
|
|
53
53
|
// methods below are public for testing
|
|
54
54
|
|
|
55
55
|
// This returns the number of compressed words that were actually used. It is the caller's
|
|
56
56
|
// responsibility to ensure that the compressed_words array is long enough to prevent over-run.
|
|
57
|
-
|
|
57
|
+
uint32_t low_level_compress_bytes(
|
|
58
58
|
const uint8_t* byte_array, // input
|
|
59
|
-
|
|
59
|
+
uint32_t num_bytes_to_encode,
|
|
60
60
|
const uint16_t* encoding_table,
|
|
61
61
|
uint32_t* compressed_words // output
|
|
62
62
|
) const;
|
|
63
63
|
|
|
64
64
|
void low_level_uncompress_bytes(
|
|
65
65
|
uint8_t* byte_array, // output
|
|
66
|
-
|
|
66
|
+
uint32_t num_bytes_to_decode,
|
|
67
67
|
const uint16_t* decoding_table,
|
|
68
68
|
const uint32_t* compressed_words,
|
|
69
|
-
|
|
69
|
+
uint32_t num_compressed_words // input
|
|
70
70
|
) const;
|
|
71
71
|
|
|
72
72
|
// Here "pairs" refers to row-column pairs that specify
|
|
73
73
|
// the positions of surprising values in the bit matrix.
|
|
74
74
|
|
|
75
75
|
// returns the number of compressedWords actually used
|
|
76
|
-
|
|
76
|
+
uint32_t low_level_compress_pairs(
|
|
77
77
|
const uint32_t* pair_array, // input
|
|
78
|
-
|
|
79
|
-
|
|
78
|
+
uint32_t num_pairs_to_encode,
|
|
79
|
+
uint8_t num_base_bits,
|
|
80
80
|
uint32_t* compressed_words // output
|
|
81
81
|
) const;
|
|
82
82
|
|
|
83
83
|
void low_level_uncompress_pairs(
|
|
84
84
|
uint32_t* pair_array, // output
|
|
85
|
-
|
|
86
|
-
|
|
85
|
+
uint32_t num_pairs_to_decode,
|
|
86
|
+
uint8_t num_base_bits,
|
|
87
87
|
const uint32_t* compressed_words, // input
|
|
88
|
-
|
|
88
|
+
uint32_t num_compressed_words // input
|
|
89
89
|
) const;
|
|
90
90
|
|
|
91
91
|
private:
|
|
@@ -122,22 +122,22 @@ private:
|
|
|
122
122
|
void uncompress_pinned_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint32_t num_coupons) const;
|
|
123
123
|
void uncompress_sliding_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint32_t num_coupons) const;
|
|
124
124
|
|
|
125
|
-
uint8_t* make_inverse_permutation(const uint8_t* permu,
|
|
126
|
-
uint16_t* make_decoding_table(const uint16_t* encoding_table,
|
|
125
|
+
uint8_t* make_inverse_permutation(const uint8_t* permu, unsigned length);
|
|
126
|
+
uint16_t* make_decoding_table(const uint16_t* encoding_table, unsigned num_byte_values);
|
|
127
127
|
void validate_decoding_table(const uint16_t* decoding_table, const uint16_t* encoding_table) const;
|
|
128
128
|
|
|
129
129
|
void compress_surprising_values(const vector_u32<A>& pairs, uint8_t lg_k, compressed_state<A>& result) const;
|
|
130
130
|
void compress_sliding_window(const uint8_t* window, uint8_t lg_k, uint32_t num_coupons, compressed_state<A>& target) const;
|
|
131
131
|
|
|
132
|
-
vector_u32<A> uncompress_surprising_values(const uint32_t* data,
|
|
133
|
-
void uncompress_sliding_window(const uint32_t* data,
|
|
132
|
+
vector_u32<A> uncompress_surprising_values(const uint32_t* data, uint32_t data_words, uint32_t num_pairs, uint8_t lg_k, const A& allocator) const;
|
|
133
|
+
void uncompress_sliding_window(const uint32_t* data, uint32_t data_words, vector_u8<A>& window, uint8_t lg_k, uint32_t num_coupons) const;
|
|
134
134
|
|
|
135
|
-
static size_t safe_length_for_compressed_pair_buf(
|
|
136
|
-
static size_t safe_length_for_compressed_window_buf(
|
|
137
|
-
static uint8_t determine_pseudo_phase(uint8_t lg_k,
|
|
135
|
+
static size_t safe_length_for_compressed_pair_buf(uint32_t k, uint32_t num_pairs, uint8_t num_base_bits);
|
|
136
|
+
static size_t safe_length_for_compressed_window_buf(uint32_t k);
|
|
137
|
+
static uint8_t determine_pseudo_phase(uint8_t lg_k, uint32_t c);
|
|
138
138
|
|
|
139
139
|
static inline vector_u32<A> tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space, const A& allocator);
|
|
140
|
-
static inline
|
|
140
|
+
static inline uint8_t golomb_choose_number_of_base_bits(uint32_t k, uint64_t count);
|
|
141
141
|
};
|
|
142
142
|
|
|
143
143
|
} /* namespace datasketches */
|